Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
3c145ba0
Commit
3c145ba0
authored
Jun 06, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: some text char removed by interline_equations overlap
parent
999b698f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
24 additions
and
9 deletions
+24
-9
equations_replace.py
magic_pdf/pre_proc/equations_replace.py
+5
-4
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+19
-5
No files found.
magic_pdf/pre_proc/equations_replace.py
View file @
3c145ba0
...
@@ -107,6 +107,7 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
...
@@ -107,6 +107,7 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
or
y0_1
>
y1_2
or
y0_1
>
y1_2
)
# box1在box2的下边
)
# box1在box2的下边
def
remove_text_block_overlap_interline_equation_bbox
(
def
remove_text_block_overlap_interline_equation_bbox
(
interline_eq_bboxes
,
pymu_block_list
interline_eq_bboxes
,
pymu_block_list
):
):
...
@@ -122,10 +123,10 @@ def remove_text_block_overlap_interline_equation_bbox(
...
@@ -122,10 +123,10 @@ def remove_text_block_overlap_interline_equation_bbox(
deleted_chars
=
[]
deleted_chars
=
[]
for
char
in
span
[
"chars"
]:
for
char
in
span
[
"chars"
]:
if
any
(
if
any
(
[
[
_is_in_or_part_overlap
(
char
[
"bbox"
],
eq_bbox
[
"bbox"
]
)
(
calculate_overlap_area_2_minbox_area_ratio
(
eq_bbox
[
"bbox"
],
char
[
"bbox"
])
>
0.5
)
for
eq_bbox
in
interline_eq_bboxes
for
eq_bbox
in
interline_eq_bboxes
]
]
):
):
deleted_chars
.
append
(
char
)
deleted_chars
.
append
(
char
)
# 检查span里没有char则删除这个span
# 检查span里没有char则删除这个span
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
3c145ba0
...
@@ -160,12 +160,12 @@ def fill_spans_in_blocks(blocks, spans, radio):
...
@@ -160,12 +160,12 @@ def fill_spans_in_blocks(blocks, spans, radio):
block_spans
.
append
(
span
)
block_spans
.
append
(
span
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list
=
[]
#
displayed_list = []
text_inline_lines
=
[]
#
text_inline_lines = []
modify_y_axis
(
block_spans
,
displayed_list
,
text_inline_lines
)
#
modify_y_axis(block_spans, displayed_list, text_inline_lines)
'''模型识别错误的行间公式, type类型转换成行内公式'''
'''模型识别错误的行间公式, type类型转换成行内公式'''
block_spans
=
modify_inline_equation
(
block_spans
,
displayed_list
,
text_inline_lines
)
#
block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''bbox去除粘连'''
# 去粘连会影响span的bbox,导致后续fill的时候出错
'''bbox去除粘连'''
# 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox_for_span(block_spans)
# block_spans = remove_overlap_between_bbox_for_span(block_spans)
...
@@ -196,8 +196,10 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
...
@@ -196,8 +196,10 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
block
=
fix_image_block
(
block
,
img_blocks
)
block
=
fix_image_block
(
block
,
img_blocks
)
elif
block_type
==
BlockType
.
Table
:
elif
block_type
==
BlockType
.
Table
:
block
=
fix_table_block
(
block
,
table_blocks
)
block
=
fix_table_block
(
block
,
table_blocks
)
elif
block_type
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
InterlineEquation
]:
elif
block_type
in
[
BlockType
.
Text
,
BlockType
.
Title
]:
block
=
fix_text_block
(
block
)
block
=
fix_text_block
(
block
)
elif
block_type
==
BlockType
.
InterlineEquation
:
block
=
fix_interline_block
(
block
)
else
:
else
:
continue
continue
fix_blocks
.
append
(
block
)
fix_blocks
.
append
(
block
)
...
@@ -315,6 +317,18 @@ def fix_table_block(block, table_blocks):
...
@@ -315,6 +317,18 @@ def fix_table_block(block, table_blocks):
def
fix_text_block
(
block
):
def
fix_text_block
(
block
):
# 文本block中的公式span都应该转换成行内type
for
span
in
block
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
InterlineEquation
:
span
[
'type'
]
=
ContentType
.
InlineEquation
block_lines
=
merge_spans_to_line
(
block
[
'spans'
])
sort_block_lines
=
line_sort_spans_by_left_to_right
(
block_lines
)
block
[
'lines'
]
=
sort_block_lines
del
block
[
'spans'
]
return
block
def
fix_interline_block
(
block
):
block_lines
=
merge_spans_to_line
(
block
[
'spans'
])
block_lines
=
merge_spans_to_line
(
block
[
'spans'
])
sort_block_lines
=
line_sort_spans_by_left_to_right
(
block_lines
)
sort_block_lines
=
line_sort_spans_by_left_to_right
(
block_lines
)
block
[
'lines'
]
=
sort_block_lines
block
[
'lines'
]
=
sort_block_lines
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment