Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
a817075b
Commit
a817075b
authored
May 06, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update discarded block and spans build logic
parent
d4f96a05
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
27 additions
and
10 deletions
+27
-10
ocr_content_type.py
magic_pdf/libs/ocr_content_type.py
+1
-0
pdf_parse_union_core.py
magic_pdf/pdf_parse_union_core.py
+11
-6
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+4
-1
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+11
-3
No files found.
magic_pdf/libs/ocr_content_type.py
View file @
a817075b
...
...
@@ -17,4 +17,5 @@ class BlockType:
Title
=
"title"
InterlineEquation
=
"interline_equation"
Footnote
=
"footnote"
Discarded
=
"discarded"
magic_pdf/pdf_parse_union_core.py
View file @
a817075b
...
...
@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from
magic_pdf.pre_proc.equations_replace
import
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
\
combine_chars_to_pymudict
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
,
\
fix_discarded_block
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
...
...
@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
spans
=
ocr_cut_image_and_table
(
spans
,
pdf_docs
[
page_id
],
page_id
,
pdf_bytes_md5
,
imageWriter
)
'''将所有区块的bbox整理到一起'''
all_bboxes
=
ocr_prepare_bboxes_for_layout_split
(
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equations
,
page_w
,
page_h
)
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_discarded_blocks
,
spans
,
0.4
)
fix_discarded_blocks
=
fix_discarded_block
(
discarded_block_with_spans
)
'''如果当前页面没有bbox则跳过'''
if
len
(
all_bboxes
)
==
0
:
logger
.
warning
(
f
"skip this page, not found bbox, page_id: {page_id}"
)
logger
.
warning
(
f
"skip this page, not found
useful
bbox, page_id: {page_id}"
)
return
ocr_construct_page_component_v2
([],
[],
page_id
,
page_w
,
page_h
,
[],
[],
[],
interline_equations
,
discarded_blocks
,
[],
[],
interline_equations
,
fix_
discarded_blocks
,
need_drop
,
drop_reason
)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
...
...
@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
'''将span填入排好序的blocks中'''
block_with_spans
=
fill_spans_in_blocks
(
sorted_blocks
,
spans
)
block_with_spans
,
spans
=
fill_spans_in_blocks
(
sorted_blocks
,
spans
,
0.6
)
'''对block进行fix操作'''
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
...
...
@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''构造pdf_info_dict'''
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
discarded_blocks
,
images
,
tables
,
interline_equations
,
fix_
discarded_blocks
,
need_drop
,
drop_reason
)
return
page_info
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
a817075b
...
...
@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
all_bboxes
=
[]
all_discarded_blocks
=
[]
for
image
in
img_blocks
:
x0
,
y0
,
x1
,
y1
=
image
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Image
,
None
,
None
,
None
,
None
])
...
...
@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50
%
区域的(限定footnote)'''
for
discarded
in
discarded_blocks
:
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
all_discarded_blocks
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Discarded
,
None
,
None
,
None
,
None
])
# 将footnote加入到all_bboxes中,用来计算layout
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
return
all_bboxes
return
all_bboxes
,
all_discarded_blocks
def
fix_text_overlap_title_blocks
(
all_bboxes
):
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
a817075b
...
...
@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
return
sort_blocks
def
fill_spans_in_blocks
(
blocks
,
spans
):
def
fill_spans_in_blocks
(
blocks
,
spans
,
radio
):
'''
将allspans中的span按位置关系,放入blocks中
'''
...
...
@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
block_spans
=
[]
for
span
in
spans
:
span_bbox
=
span
[
'bbox'
]
if
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.6
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
radio
:
block_spans
.
append
(
span
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
...
...
@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
for
span
in
block_spans
:
spans
.
remove
(
span
)
return
block_with_spans
return
block_with_spans
,
spans
def
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
):
...
...
@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return
fix_blocks
def fix_discarded_block(discarded_block_with_spans):
    """Apply ``fix_text_block`` to every discarded block.

    Args:
        discarded_block_with_spans: Iterable of discarded blocks; each
            element is handed unchanged to ``fix_text_block``.

    Returns:
        A new list containing the value returned by ``fix_text_block``
        for each input block, in the same order as the input.
    """
    # Every discarded block goes through the same fixer as a text block.
    return [fix_text_block(entry) for entry in discarded_block_with_spans]
def
merge_spans_to_block
(
spans
:
list
,
block_bbox
:
list
,
block_type
:
str
):
block_spans
=
[]
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment