Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
a48f1d14
Commit
a48f1d14
authored
Apr 29, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
skip complicated layout page
parent
f01cb89f
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
124 additions
and
38 deletions
+124
-38
pdf_parse_by_ocr_v2.py
magic_pdf/pdf_parse_by_ocr_v2.py
+32
-1
pdf_parse_by_txt_v2.py
magic_pdf/pdf_parse_by_txt_v2.py
+35
-5
resolve_bbox_conflict.py
magic_pdf/pre_proc/resolve_bbox_conflict.py
+57
-32
No files found.
magic_pdf/pdf_parse_by_ocr_v2.py
View file @
a48f1d14
...
...
@@ -2,8 +2,9 @@ import time
from
loguru
import
logger
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
,
LAYOUT_UNPROC
,
get_columns_cnt_of_layout
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
from
magic_pdf.model.magic_model
import
MagicModel
...
...
@@ -14,6 +15,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
# from magic_pdf.para.para_split import para_split
from
magic_pdf.para.para_split_v2
import
para_split
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
def
parse_pdf_by_ocr
(
pdf_bytes
,
...
...
@@ -63,10 +65,39 @@ def parse_pdf_by_ocr(pdf_bytes,
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equations
,
page_w
,
page_h
)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks
=
[]
for
bbox
in
all_bboxes
:
useful_blocks
.
append
({
"bbox"
:
bbox
[:
4
]
})
is_useful_block_horz_overlap
=
check_useful_block_horizontal_overlap
(
useful_blocks
)
if
is_useful_block_horz_overlap
:
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}"
)
continue
'''根据区块信息计算layout'''
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
all_bboxes
,
page_boundry
,
page_id
)
if
len
(
text_blocks
)
>
0
and
len
(
all_bboxes
)
>
0
and
len
(
layout_bboxes
)
==
0
:
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}"
)
continue
"""以下去掉复杂的布局和超过2列的布局"""
if
any
([
lay
[
"layout_label"
]
==
LAYOUT_UNPROC
for
lay
in
layout_bboxes
]):
# 复杂的布局
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}"
)
continue
layout_column_width
=
get_columns_cnt_of_layout
(
layout_tree
)
if
layout_column_width
>
2
:
# 去掉超过2列的布局pdf
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}"
)
continue
'''根据layout顺序,对当前页面所有需要留下的block进行排序'''
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
...
...
magic_pdf/pdf_parse_by_txt_v2.py
View file @
a48f1d14
...
...
@@ -2,8 +2,9 @@ import time
from
loguru
import
logger
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
,
LAYOUT_UNPROC
,
get_columns_cnt_of_layout
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
from
magic_pdf.model.magic_model
import
MagicModel
...
...
@@ -33,6 +34,8 @@ from magic_pdf.pre_proc.equations_replace import (
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.libs.math
import
float_equal
from
magic_pdf.para.para_split_v2
import
para_split
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
def
txt_spans_extract
(
pdf_page
,
inline_equations
,
interline_equations
):
text_raw_blocks
=
pdf_page
.
get_text
(
"dict"
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
"blocks"
]
...
...
@@ -123,11 +126,38 @@ def parse_pdf_by_txt(
page_h
,
)
"""根据区块信息计算layout"""
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks
=
[]
for
bbox
in
all_bboxes
:
useful_blocks
.
append
({
"bbox"
:
bbox
[:
4
]
})
is_useful_block_horz_overlap
=
check_useful_block_horizontal_overlap
(
useful_blocks
)
if
is_useful_block_horz_overlap
:
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}"
)
continue
'''根据区块信息计算layout'''
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
all_bboxes
,
page_boundry
,
page_id
)
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
all_bboxes
,
page_boundry
,
page_id
)
if
len
(
text_blocks
)
>
0
and
len
(
all_bboxes
)
>
0
and
len
(
layout_bboxes
)
==
0
:
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}"
)
continue
"""以下去掉复杂的布局和超过2列的布局"""
if
any
([
lay
[
"layout_label"
]
==
LAYOUT_UNPROC
for
lay
in
layout_bboxes
]):
# 复杂的布局
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}"
)
continue
layout_column_width
=
get_columns_cnt_of_layout
(
layout_tree
)
if
layout_column_width
>
2
:
# 去掉超过2列的布局pdf
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}"
)
continue
"""根据layout顺序,对当前页面所有需要留下的block进行排序"""
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
...
...
magic_pdf/pre_proc/resolve_bbox_conflict.py
View file @
a48f1d14
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
...
...
@@ -9,7 +8,8 @@ from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_over
from
magic_pdf.libs.drop_tag
import
ON_IMAGE_TEXT
,
ON_TABLE_TEXT
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
"""
text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json
当下采用一种粗暴的方式:
...
...
@@ -87,7 +87,7 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
# 图片和图片重叠,两张都暂时不参与版面计算
images_dup_index
=
[]
for
i
in
range
(
len
(
images
)):
for
j
in
range
(
i
+
1
,
len
(
images
)):
for
j
in
range
(
i
+
1
,
len
(
images
)):
if
_is_in_or_part_overlap
(
images
[
i
],
images
[
j
]):
images_dup_index
.
append
(
i
)
images_dup_index
.
append
(
j
)
...
...
@@ -121,25 +121,25 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
return
images
,
tables
,
interline_equations
,
inline_equations
,
text_raw_blocks
,
text_block_removed
,
images_backup
,
text_block_removed_2
def
check_text_block_horizontal_overlap
(
text_blocks
:
list
,
header
,
footer
)
->
bool
:
def
check_text_block_horizontal_overlap
(
text_blocks
:
list
,
header
,
footer
)
->
bool
:
"""
检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。
因为这种情况大概率发生了公式没有被检测出来。
"""
if
len
(
text_blocks
)
==
0
:
if
len
(
text_blocks
)
==
0
:
return
False
page_min_y
=
0
page_max_y
=
max
(
yy
[
'bbox'
][
3
]
for
yy
in
text_blocks
)
def
__max_y
(
lst
:
list
):
if
len
(
lst
)
>
0
:
def
__max_y
(
lst
:
list
):
if
len
(
lst
)
>
0
:
return
max
([
item
[
1
]
for
item
in
lst
])
return
page_min_y
def
__min_y
(
lst
:
list
):
if
len
(
lst
)
>
0
:
def
__min_y
(
lst
:
list
):
if
len
(
lst
)
>
0
:
return
min
([
item
[
3
]
for
item
in
lst
])
return
page_max_y
...
...
@@ -149,13 +149,38 @@ def check_text_block_horizontal_overlap(text_blocks:list, header, footer) -> boo
txt_bboxes
=
[]
for
text_block
in
text_blocks
:
bbox
=
text_block
[
"bbox"
]
if
bbox
[
1
]
>=
clip_y0
and
bbox
[
3
]
<=
clip_y1
:
if
bbox
[
1
]
>=
clip_y0
and
bbox
[
3
]
<=
clip_y1
:
txt_bboxes
.
append
(
bbox
)
for
i
in
range
(
len
(
txt_bboxes
)):
for
j
in
range
(
i
+
1
,
len
(
txt_bboxes
)):
for
j
in
range
(
i
+
1
,
len
(
txt_bboxes
)):
if
_is_left_overlap
(
txt_bboxes
[
i
],
txt_bboxes
[
j
])
or
_is_left_overlap
(
txt_bboxes
[
j
],
txt_bboxes
[
i
]):
return
True
return
False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
    """Report whether any two useful blocks overlap horizontally.

    Every unordered pair of block bboxes is tested with ``_is_left_overlap``
    in both argument orders; the first pair for which either call is truthy
    makes the function return ``True``.

    Rationale carried over from the original docstring: when such an overlap
    occurs, processing is abandoned, because it most likely means a formula
    was not detected.

    Args:
        useful_blocks: list of dicts, each holding a ``"bbox"`` sequence.
            Index 1 is read as the block's top y value.
            NOTE(review): presumably bbox is (x0, y0, x1, y1) -- confirm.

    Returns:
        ``True`` if at least one pair overlaps according to
        ``_is_left_overlap``; ``False`` otherwise, including when there are
        fewer than two candidate blocks.
    """
    if not useful_blocks:
        return False

    # Blocks whose top edge lies above y == 0 are left out of the comparison.
    # The original code also required bbox[3] <= max(bbox[3] of all blocks),
    # which is true for every block by construction, so it is dropped here.
    page_min_y = 0
    candidates = [
        block["bbox"] for block in useful_blocks if block["bbox"][1] >= page_min_y
    ]

    # Compare each unordered pair once. _is_left_overlap is called in both
    # directions, exactly as before, since it may not be symmetric.
    for idx, first in enumerate(candidates):
        for second in candidates[idx + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True

    return False
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment