Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
deb98fd0
Commit
deb98fd0
authored
May 08, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix footnote overlap error
parent
288bb074
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
11 additions
and
9 deletions
+11
-9
pdf_parse_union_core.py
magic_pdf/pdf_parse_union_core.py
+2
-2
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+6
-4
resolve_bbox_conflict.py
magic_pdf/pre_proc/resolve_bbox_conflict.py
+3
-3
No files found.
magic_pdf/pdf_parse_union_core.py
View file @
deb98fd0
...
...
@@ -29,10 +29,10 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
useful_blocks
.
append
({
"bbox"
:
bbox
[:
4
]
})
is_useful_block_horz_overlap
,
smaller_bbox
=
check_useful_block_horizontal_overlap
(
useful_blocks
)
is_useful_block_horz_overlap
,
smaller_bbox
,
bigger_bbox
=
check_useful_block_horizontal_overlap
(
useful_blocks
)
if
is_useful_block_horz_overlap
:
logger
.
warning
(
f
"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}"
)
f
"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}
, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}
"
)
for
bbox
in
all_bboxes
.
copy
():
if
smaller_bbox
==
bbox
[:
4
]:
all_bboxes
.
remove
(
bbox
)
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
deb98fd0
...
...
@@ -34,10 +34,6 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
all_bboxes
=
fix_text_overlap_title_blocks
(
all_bboxes
)
'''任何框体与舍弃框重叠,优先信任舍弃框'''
all_bboxes
=
remove_need_drop_blocks
(
all_bboxes
,
discarded_blocks
)
'''经过以上处理后,还存在大框套小框的情况,则删除小框'''
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes
=
remove_overlap_between_bbox_for_block
(
all_bboxes
)
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50
%
区域的(限定footnote)'''
for
discarded
in
discarded_blocks
:
...
...
@@ -47,6 +43,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
'''经过以上处理后,还存在大框套小框的情况,则删除小框'''
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
all_discarded_blocks
=
remove_overlaps_min_blocks
(
all_discarded_blocks
)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes
=
remove_overlap_between_bbox_for_block
(
all_bboxes
)
return
all_bboxes
,
all_discarded_blocks
...
...
magic_pdf/pre_proc/resolve_bbox_conflict.py
View file @
deb98fd0
...
...
@@ -184,8 +184,8 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
area_j
=
(
useful_bboxes
[
j
][
2
]
-
useful_bboxes
[
j
][
0
])
*
(
useful_bboxes
[
j
][
3
]
-
useful_bboxes
[
j
][
1
])
if
_is_left_overlap
(
useful_bboxes
[
i
],
useful_bboxes
[
j
])
or
_is_left_overlap
(
useful_bboxes
[
j
],
useful_bboxes
[
i
]):
if
area_i
>
area_j
:
return
True
,
useful_bboxes
[
j
]
return
True
,
useful_bboxes
[
j
]
,
useful_bboxes
[
i
]
else
:
return
True
,
useful_bboxes
[
i
]
return
True
,
useful_bboxes
[
i
]
,
useful_bboxes
[
j
]
return
False
,
None
return
False
,
None
,
None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment