Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
d438b97a
Commit
d438b97a
authored
Apr 16, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
切图逻辑重构
parent
1f186f5f
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
69 additions
and
75 deletions
+69
-75
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+7
-45
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+6
-7
pdf_parse_by_txt.py
magic_pdf/pdf_parse_by_txt.py
+2
-3
pdf_parse_for_train.py
magic_pdf/pdf_parse_for_train.py
+2
-2
cut_image.py
magic_pdf/pre_proc/cut_image.py
+52
-0
ocr_cut_image.py
magic_pdf/pre_proc/ocr_cut_image.py
+0
-18
No files found.
magic_pdf/libs/pdf_image_tools.py
View file @
d438b97a
from
loguru
import
logger
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.commons
import
fitz
from
loguru
import
logger
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.hash_utils
import
compute_sha256
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
:
AbsReaderWriter
):
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
:
AbsReaderWriter
):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
...
...
@@ -19,6 +20,10 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
# 新版本生成平铺路径
img_hash256_path
=
f
"{compute_sha256(img_path)}.jpg"
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"image_bboxes: 错误的box, {bbox}"
)
return
img_hash256_path
# 将坐标转换为fitz.Rect对象
rect
=
fitz
.
Rect
(
*
bbox
)
# 配置缩放倍数为3倍
...
...
@@ -31,46 +36,3 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
imageWriter
.
write
(
byte_data
,
img_hash256_path
,
AbsReaderWriter
.
MODE_BIN
)
return
img_hash256_path
def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
                          image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
                          equation_inline_bboxes: list,
                          equation_interline_bboxes: list, imageWriter) -> tuple:
    """
    Crop the image, backup-image and table regions of one page via ``cut_image``.

    Every box is checked first: a box whose left edge is not strictly left of
    its right edge, or whose top edge is not strictly above its bottom edge,
    is logged with ``logger.warning`` and skipped.

    Args:
        page_num: Page index, forwarded unchanged to ``cut_image``.
        page: Page object, forwarded unchanged to ``cut_image``.
        pdf_bytes_md5: Joined with ``"images"`` or ``"tables"`` through
            ``join_path``; the result is passed to ``cut_image`` as its
            ``return_path`` argument.
        image_bboxes: Boxes cropped under ``"images"``.
        images_overlap_backup: Further boxes cropped under ``"images"``.
        table_bboxes: Boxes cropped under ``"tables"``.
        equation_inline_bboxes: Kept for interface compatibility; not read.
        equation_interline_bboxes: Kept for interface compatibility; not read.
        imageWriter: Writer object, forwarded unchanged to ``cut_image``.

    Returns:
        The 5-tuple ``(image_info, image_backup_info, table_info,
        inline_eq_info, interline_eq_info)``.  The first three are lists of
        ``{"bbox": bbox, "image_path": <value returned by cut_image>}`` in
        input order, without the skipped boxes.  The last two are always
        empty lists because equations are not cropped here.
    """
    # NOTE(review): an earlier comment described the stored file name as
    # {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg; the actual name
    # is whatever cut_image returns -- confirm there before relying on it.

    def crop_valid(bboxes, category, label):
        # Shared loop for all three box families; `label` only affects the
        # warning text so each family keeps its own log message.
        records = []
        for bbox in bboxes:
            if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
                logger.warning(f"{label}: 错误的box, {bbox}")
                continue
            image_path = cut_image(bbox, page_num, page, join_path(pdf_bytes_md5, category), imageWriter)
            records.append({"bbox": bbox, "image_path": image_path})
        return records

    image_info = crop_valid(image_bboxes, "images", "image_bboxes")
    image_backup_info = crop_valid(images_overlap_backup, "images", "images_overlap_backup")
    table_info = crop_valid(table_bboxes, "tables", "table_bboxes")

    # Equation crops are not produced; callers still unpack five values.
    inline_eq_info = []
    interline_eq_info = []

    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
\ No newline at end of file
magic_pdf/pdf_parse_by_ocr.py
View file @
d438b97a
...
...
@@ -16,7 +16,7 @@ from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from
magic_pdf.pre_proc.detect_footnote
import
parse_footnotes_by_model
from
magic_pdf.pre_proc.detect_header
import
parse_headers
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.
ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.
cut_image
import
ocr_
cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
merge_spans_to_line_by_layout
,
merge_lines_to_block
,
...
...
@@ -27,7 +27,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
def
parse_pdf_by_ocr
(
pdf_bytes
,
pdf_model_output
,
...
...
@@ -148,7 +147,7 @@ def parse_pdf_by_ocr(
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
'''对image和table截图'''
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
pdf_bytes_md5
,
imageWriter
)
spans
=
ocr_
cut_image_and_table
(
spans
,
page
,
page_id
,
pdf_bytes_md5
,
imageWriter
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list
=
[]
...
...
@@ -202,10 +201,10 @@ def parse_pdf_by_ocr(
'''构造pdf_info_dict'''
page_info
=
ocr_construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
need_remove_spans_bboxes_dict
)
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
need_remove_spans_bboxes_dict
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
"""分段"""
...
...
magic_pdf/pdf_parse_by_txt.py
View file @
d438b97a
...
...
@@ -17,6 +17,7 @@ from magic_pdf.libs.hash_utils import compute_md5
from
magic_pdf.libs.markdown_utils
import
escape_special_markdown_char
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
magic_pdf.pre_proc.cut_image
import
txt_save_images_by_bboxes
from
magic_pdf.pre_proc.detect_images
import
parse_images
from
magic_pdf.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
magic_pdf.pre_proc.detect_equation
import
parse_equations
# 获取equations的bbox
...
...
@@ -48,8 +49,6 @@ from para.exceptions import (
)
'''
from
magic_pdf.libs.commons
import
read_file
,
join_path
from
magic_pdf.libs.pdf_image_tools
import
save_images_by_bboxes
from
magic_pdf.post_proc.remove_footnote
import
merge_footnote_blocks
,
remove_footnote_blocks
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
...
...
@@ -194,7 +193,7 @@ def parse_pdf_by_txt(
"""
# 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
=
save_images_by_bboxes
(
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
=
txt_
save_images_by_bboxes
(
page_id
,
page
,
pdf_bytes_md5
,
...
...
magic_pdf/pdf_parse_for_train.py
View file @
d438b97a
...
...
@@ -26,6 +26,7 @@ from magic_pdf.libs.drop_reason import DropReason
from
magic_pdf.libs.markdown_utils
import
escape_special_markdown_char
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
magic_pdf.pre_proc.cut_image
import
txt_save_images_by_bboxes
from
magic_pdf.pre_proc.detect_images
import
parse_images
from
magic_pdf.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
magic_pdf.pre_proc.detect_equation
import
parse_equations
# 获取equations的bbox
...
...
@@ -62,7 +63,6 @@ from para.exceptions import (
"""
from
magic_pdf.libs.commons
import
read_file
,
join_path
from
magic_pdf.libs.pdf_image_tools
import
save_images_by_bboxes
from
magic_pdf.post_proc.remove_footnote
import
(
merge_footnote_blocks
,
remove_footnote_blocks
,
...
...
@@ -323,7 +323,7 @@ def parse_pdf_for_train(
# 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
=
(
save_images_by_bboxes
(
txt_
save_images_by_bboxes
(
book_name
,
page_id
,
page
,
...
...
magic_pdf/pre_proc/cut_image.py
0 → 100644
View file @
d438b97a
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    """
    Attach an ``image_path`` to every image and table span of one page.

    For each span whose ``'type'`` equals ``ContentType.Image`` or
    ``ContentType.Table``, ``cut_image`` is called with the span's ``'bbox'``
    and a ``return_path`` of ``join_path(pdf_bytes_md5, 'images')`` or
    ``join_path(pdf_bytes_md5, 'tables')`` respectively.  The value it returns
    is stored under ``span['image_path']``.  Spans of any other type are left
    untouched.

    Returns:
        The same ``spans`` object that was passed in, modified in place.
    """
    for item in spans:
        kind = item['type']
        if kind == ContentType.Image:
            folder = 'images'
        elif kind == ContentType.Table:
            folder = 'tables'
        else:
            # Neither an image nor a table: nothing to crop.
            continue
        item['image_path'] = cut_image(
            item['bbox'],
            page_id,
            page,
            return_path=join_path(pdf_bytes_md5, folder),
            imageWriter=imageWriter,
        )
    return spans
def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
                              image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
                              equation_inline_bboxes: list,
                              equation_interline_bboxes: list, imageWriter) -> tuple:
    """
    Crop the image, backup-image and table regions of one page via ``cut_image``.

    Args:
        page_num: Page index, forwarded unchanged to ``cut_image``.
        page: Page object, forwarded unchanged to ``cut_image``.
        pdf_bytes_md5: Joined with ``"images"`` or ``"tables"`` through
            ``join_path``; the result is passed to ``cut_image`` as its
            ``return_path`` argument.
        image_bboxes: Boxes cropped under ``"images"``.
        images_overlap_backup: Further boxes cropped under ``"images"``.
        table_bboxes: Boxes cropped under ``"tables"``.
        equation_inline_bboxes: Kept for interface compatibility; not read.
        equation_interline_bboxes: Kept for interface compatibility; not read.
        imageWriter: Writer object, forwarded unchanged to ``cut_image``.

    Returns:
        The 5-tuple ``(image_info, image_backup_info, table_info,
        inline_eq_info, interline_eq_info)``.  The first three are lists of
        ``{"bbox": bbox, "image_path": <value returned by cut_image>}`` in
        input order.  The last two are always empty lists because equations
        are not cropped here.
    """
    # NOTE(review): an earlier comment described the stored file name as
    # {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg; the actual name
    # is whatever cut_image returns -- confirm there before relying on it.

    def crop_all(bboxes, category):
        # One record per box; replaces three copies of the same loop.
        return [
            {
                "bbox": bbox,
                "image_path": cut_image(bbox, page_num, page, join_path(pdf_bytes_md5, category), imageWriter),
            }
            for bbox in bboxes
        ]

    image_info = crop_all(image_bboxes, "images")
    image_backup_info = crop_all(images_overlap_backup, "images")
    table_info = crop_all(table_bboxes, "tables")

    # Equation crops are not produced; callers still unpack five values.
    inline_eq_info = []
    interline_eq_info = []

    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
magic_pdf/pre_proc/ocr_cut_image.py
deleted
100644 → 0
View file @
1f186f5f
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    """
    Crop every image and table span of one page and record the crop's path.

    Spans whose ``'type'`` equals ``ContentType.Image`` are cropped with a
    ``return_path`` of ``join_path(pdf_bytes_md5, 'images')``; spans whose
    ``'type'`` equals ``ContentType.Table`` use ``'tables'`` instead.  The
    value returned by ``cut_image`` is written to ``span['image_path']``.
    Spans of any other type are not modified.

    Returns:
        The ``spans`` argument itself, updated in place.
    """
    def crop(span, folder):
        # Single call site for cut_image; only the sub-folder differs.
        return cut_image(
            span['bbox'],
            page_id,
            page,
            return_path=join_path(pdf_bytes_md5, folder),
            imageWriter=imageWriter,
        )

    for span in spans:
        span_type = span['type']
        if span_type == ContentType.Image:
            span['image_path'] = crop(span, 'images')
        elif span_type == ContentType.Table:
            span['image_path'] = crop(span, 'tables')
    return spans
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment