Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
a0135640
Commit
a0135640
authored
Mar 15, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复spans为空list导致的IndexError: list index out of range
parent
f10b4a50
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
89 additions
and
83 deletions
+89
-83
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+29
-26
ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+60
-57
No files found.
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
a0135640
...
@@ -24,34 +24,37 @@ def line_sort_spans_by_left_to_right(lines):
...
@@ -24,34 +24,37 @@ def line_sort_spans_by_left_to_right(lines):
return
line_objects
return
line_objects
def
merge_spans_to_line
(
spans
):
def
merge_spans_to_line
(
spans
):
# 按照y0坐标排序
if
len
(
spans
)
==
0
:
spans
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
1
])
return
[]
else
:
lines
=
[]
# 按照y0坐标排序
current_line
=
[
spans
[
0
]]
spans
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
1
])
for
span
in
spans
[
1
:]:
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
lines
=
[]
# image和table类型,同上
current_line
=
[
spans
[
0
]]
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
or
any
(
for
span
in
spans
[
1
:]:
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# 则开始新行
# image和table类型,同上
lines
.
append
(
current_line
)
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
or
any
(
current_line
=
[
span
]
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
continue
# 则开始新行
lines
.
append
(
current_line
)
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
current_line
=
[
span
]
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
]):
continue
current_line
.
append
(
span
)
else
:
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
# 否则,开始新行
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
]):
current_line
.
append
(
span
)
else
:
# 否则,开始新行
lines
.
append
(
current_line
)
current_line
=
[
span
]
# 添加最后一行
if
current_line
:
lines
.
append
(
current_line
)
lines
.
append
(
current_line
)
current_line
=
[
span
]
# 添加最后一行
return
lines
if
current_line
:
lines
.
append
(
current_line
)
return
lines
def
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
):
def
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
):
lines
=
[]
lines
=
[]
...
...
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
a0135640
...
@@ -77,70 +77,73 @@ def adjust_bbox_for_standalone_block(spans):
...
@@ -77,70 +77,73 @@ def adjust_bbox_for_standalone_block(spans):
def
modify_y_axis
(
spans
:
list
,
displayed_list
:
list
,
text_inline_lines
:
list
):
def
modify_y_axis
(
spans
:
list
,
displayed_list
:
list
,
text_inline_lines
:
list
):
# displayed_list = []
# displayed_list = []
# 如果spans为空,则不处理
if
len
(
spans
)
==
0
:
pass
else
:
spans
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
1
])
lines
=
[]
current_line
=
[
spans
[
0
]]
if
spans
[
0
][
"type"
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
displayed_list
.
append
(
spans
[
0
])
line_first_y0
=
spans
[
0
][
"bbox"
][
1
]
line_first_y
=
spans
[
0
][
"bbox"
][
3
]
# 用于给行间公式搜索
# text_inline_lines = []
for
span
in
spans
[
1
:]:
# if span.get("content","") == "78.":
# print("debug")
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
or
any
(
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
# 传入
if
span
[
"type"
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
displayed_list
.
append
(
span
)
# 则开始新行
lines
.
append
(
current_line
)
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
current_line
=
[
span
]
line_first_y0
=
span
[
"bbox"
][
1
]
line_first_y
=
span
[
"bbox"
][
3
]
continue
spans
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
1
])
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
]):
lines
=
[]
if
span
[
"type"
]
==
"text"
:
current_line
=
[
spans
[
0
]
]
line_first_y0
=
span
[
"bbox"
][
1
]
if
spans
[
0
][
"type"
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
line_first_y
=
span
[
"bbox"
][
3
]
displayed_list
.
append
(
spans
[
0
]
)
current_line
.
append
(
span
)
line_first_y0
=
spans
[
0
][
"bbox"
][
1
]
else
:
line_first_y
=
spans
[
0
][
"bbox"
][
3
]
# 否则,开始新行
# 用于给行间公式搜索
lines
.
append
(
current_line
)
# text_inline_lines = []
for
span
in
spans
[
1
:]:
# if span.get("content","") == "78.":
# print("debug")
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
or
any
(
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
# 传入
if
span
[
"type"
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
displayed_list
.
append
(
span
)
# 则开始新行
lines
.
append
(
current_line
)
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
current_line
=
[
span
]
current_line
=
[
span
]
line_first_y0
=
span
[
"bbox"
][
1
]
line_first_y
=
span
[
"bbox"
][
3
]
continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
]):
if
span
[
"type"
]
==
"text"
:
line_first_y0
=
span
[
"bbox"
][
1
]
line_first_y0
=
span
[
"bbox"
][
1
]
line_first_y
=
span
[
"bbox"
][
3
]
line_first_y
=
span
[
"bbox"
][
3
]
current_line
.
append
(
span
)
else
:
# 添加最后一行
# 否则,开始新行
if
current_line
:
lines
.
append
(
current_line
)
lines
.
append
(
current_line
)
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
current_line
=
[
span
]
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
line_first_y0
=
span
[
"bbox"
][
1
]
for
line
in
text_inline_lines
:
line_first_y
=
span
[
"bbox"
][
3
]
# 按照x0坐标排序
current_line
=
line
[
0
]
# 添加最后一行
current_line
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
0
])
if
current_line
:
lines
.
append
(
current_line
)
# 调整每一个文字行内bbox统一
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
for
line
in
text_inline_lines
:
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
current_line
,
(
line_first_y0
,
line_first_y
)
=
line
for
line
in
text_inline_lines
:
for
span
in
current_line
:
# 按照x0坐标排序
span
[
"bbox"
][
1
]
=
line_first_y0
current_line
=
line
[
0
]
span
[
"bbox"
][
3
]
=
line_first_y
current_line
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
0
])
# return spans, displayed_list, text_inline_lines
# 调整每一个文字行内bbox统一
for
line
in
text_inline_lines
:
current_line
,
(
line_first_y0
,
line_first_y
)
=
line
for
span
in
current_line
:
span
[
"bbox"
][
1
]
=
line_first_y0
span
[
"bbox"
][
3
]
=
line_first_y
# return spans, displayed_list, text_inline_lines
def
modify_inline_equation
(
spans
:
list
,
displayed_list
:
list
,
text_inline_lines
:
list
):
def
modify_inline_equation
(
spans
:
list
,
displayed_list
:
list
,
text_inline_lines
:
list
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment