Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
8c089976
Commit
8c089976
authored
Mar 25, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
更新注释
parent
473a0a7d
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
34 additions
and
38 deletions
+34
-38
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+34
-38
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
8c089976
...
@@ -57,16 +57,16 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
...
@@ -57,16 +57,16 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
def
parse_pdf_by_ocr
(
def
parse_pdf_by_ocr
(
pdf_path
,
pdf_path
,
s3_pdf_profile
,
s3_pdf_profile
,
pdf_model_output
,
pdf_model_output
,
save_path
,
save_path
,
book_name
,
book_name
,
pdf_model_profile
=
None
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
start_page_id
=
0
,
end_page_id
=
None
,
end_page_id
=
None
,
debug_mode
=
False
,
debug_mode
=
False
,
):
):
pdf_bytes
=
read_file
(
pdf_path
,
s3_pdf_profile
)
pdf_bytes
=
read_file
(
pdf_path
,
s3_pdf_profile
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
...
@@ -95,7 +95,6 @@ def parse_pdf_by_ocr(
...
@@ -95,7 +95,6 @@ def parse_pdf_by_ocr(
start_time
=
time
.
time
()
start_time
=
time
.
time
()
end_page_id
=
end_page_id
if
end_page_id
else
len
(
pdf_docs
)
-
1
end_page_id
=
end_page_id
if
end_page_id
else
len
(
pdf_docs
)
-
1
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
...
@@ -199,50 +198,48 @@ def parse_pdf_by_ocr(
...
@@ -199,50 +198,48 @@ def parse_pdf_by_ocr(
else
:
else
:
continue
continue
'''删除重叠spans中较小的那些'''
# 删除重叠spans中较小的那些
spans
,
dropped_spans_by_span_overlap
=
remove_overlaps_min_spans
(
spans
)
spans
,
dropped_spans_by_span_overlap
=
remove_overlaps_min_spans
(
spans
)
# 删除remove_span_block_bboxes中的bbox
'''
# spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
删除remove_span_block_bboxes中的bbox
# 按qa要求,增加drop相关数据
并增加drop相关数据
'''
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
# 对image和table截图
'''对image和table截图'''
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list
=
[]
displayed_list
=
[]
text_inline_lines
=
[]
text_inline_lines
=
[]
modify_y_axis
(
spans
,
displayed_list
,
text_inline_lines
)
modify_y_axis
(
spans
,
displayed_list
,
text_inline_lines
)
# 模型识别错误的行间公式, type类型转换成行内公式
'''模型识别错误的行间公式, type类型转换成行内公式'''
spans
=
modify_inline_equation
(
spans
,
displayed_list
,
text_inline_lines
)
spans
=
modify_inline_equation
(
spans
,
displayed_list
,
text_inline_lines
)
# bbox去除粘连
'''bbox去除粘连'''
spans
=
remove_overlap_between_bbox
(
spans
)
spans
=
remove_overlap_between_bbox
(
spans
)
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
'''
对tpye=["interline_equation", "image", "table"]进行额外处理,
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
'''
spans
=
adjust_bbox_for_standalone_block
(
spans
)
spans
=
adjust_bbox_for_standalone_block
(
spans
)
'''从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)'''
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes
,
layout_tree
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
layout_bboxes
,
layout_tree
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
# 将spans合并成line(在layout内,从上到下,从左到右)
'''将spans合并成line(在layout内,从上到下,从左到右)'''
lines
,
dropped_spans_by_layout
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
lines
,
dropped_spans_by_layout
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
# 将lines合并成block
'''将lines合并成block'''
blocks
=
merge_lines_to_block
(
lines
)
blocks
=
merge_lines_to_block
(
lines
)
# 根据block合并段落
'''获取QA需要外置的list'''
#para_blocks = para_split(blocks, layout_bboxes)
# 获取QA需要外置的list
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
# drop的span_list合并
'''drop的span_list合并'''
dropped_spans
=
[]
dropped_spans
=
[]
dropped_spans
.
extend
(
dropped_spans_by_span_overlap
)
dropped_spans
.
extend
(
dropped_spans_by_span_overlap
)
dropped_spans
.
extend
(
dropped_spans_by_removed_bboxes
)
dropped_spans
.
extend
(
dropped_spans_by_removed_bboxes
)
...
@@ -263,19 +260,18 @@ def parse_pdf_by_ocr(
...
@@ -263,19 +260,18 @@ def parse_pdf_by_ocr(
elif
span
[
'type'
]
in
[
ContentType
.
InlineEquation
,
ContentType
.
InterlineEquation
]:
elif
span
[
'type'
]
in
[
ContentType
.
InlineEquation
,
ContentType
.
InterlineEquation
]:
dropped_equation_block
.
append
(
span
)
dropped_equation_block
.
append
(
span
)
'''构造pdf_info_dict'''
# 构造pdf_info_dict
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
need_remove_spans_bboxes_dict
)
need_remove_spans_bboxes_dict
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
"""分段"""
"""分段"""
para_split
(
pdf_info_dict
)
para_split
(
pdf_info_dict
)
# 在测试时,保存调试信息
'''在测试时,保存调试信息'''
if
debug_mode
:
if
debug_mode
:
params_file_save_path
=
join_path
(
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment