Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
a48f1d14
Commit
a48f1d14
authored
Apr 29, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
skip complicated layout page
parent
f01cb89f
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
124 additions
and
38 deletions
+124
-38
pdf_parse_by_ocr_v2.py
magic_pdf/pdf_parse_by_ocr_v2.py
+32
-1
pdf_parse_by_txt_v2.py
magic_pdf/pdf_parse_by_txt_v2.py
+35
-5
resolve_bbox_conflict.py
magic_pdf/pre_proc/resolve_bbox_conflict.py
+57
-32
No files found.
magic_pdf/pdf_parse_by_ocr_v2.py
View file @
a48f1d14
...
@@ -2,8 +2,9 @@ import time
...
@@ -2,8 +2,9 @@ import time
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
,
LAYOUT_UNPROC
,
get_columns_cnt_of_layout
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
...
@@ -14,6 +15,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_
...
@@ -14,6 +15,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
# from magic_pdf.para.para_split import para_split
# from magic_pdf.para.para_split import para_split
from
magic_pdf.para.para_split_v2
import
para_split
from
magic_pdf.para.para_split_v2
import
para_split
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
def
parse_pdf_by_ocr
(
pdf_bytes
,
def
parse_pdf_by_ocr
(
pdf_bytes
,
...
@@ -63,10 +65,39 @@ def parse_pdf_by_ocr(pdf_bytes,
...
@@ -63,10 +65,39 @@ def parse_pdf_by_ocr(pdf_bytes,
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equations
,
page_w
,
page_h
)
interline_equations
,
page_w
,
page_h
)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks
=
[]
for
bbox
in
all_bboxes
:
useful_blocks
.
append
({
"bbox"
:
bbox
[:
4
]
})
is_useful_block_horz_overlap
=
check_useful_block_horizontal_overlap
(
useful_blocks
)
if
is_useful_block_horz_overlap
:
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}"
)
continue
'''根据区块信息计算layout'''
'''根据区块信息计算layout'''
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
all_bboxes
,
page_boundry
,
page_id
)
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
all_bboxes
,
page_boundry
,
page_id
)
if
len
(
text_blocks
)
>
0
and
len
(
all_bboxes
)
>
0
and
len
(
layout_bboxes
)
==
0
:
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}"
)
continue
"""以下去掉复杂的布局和超过2列的布局"""
if
any
([
lay
[
"layout_label"
]
==
LAYOUT_UNPROC
for
lay
in
layout_bboxes
]):
# 复杂的布局
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}"
)
continue
layout_column_width
=
get_columns_cnt_of_layout
(
layout_tree
)
if
layout_column_width
>
2
:
# 去掉超过2列的布局pdf
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}"
)
continue
'''根据layout顺序,对当前页面所有需要留下的block进行排序'''
'''根据layout顺序,对当前页面所有需要留下的block进行排序'''
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
...
...
magic_pdf/pdf_parse_by_txt_v2.py
View file @
a48f1d14
...
@@ -2,8 +2,9 @@ import time
...
@@ -2,8 +2,9 @@ import time
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
,
LAYOUT_UNPROC
,
get_columns_cnt_of_layout
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
...
@@ -33,6 +34,8 @@ from magic_pdf.pre_proc.equations_replace import (
...
@@ -33,6 +34,8 @@ from magic_pdf.pre_proc.equations_replace import (
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.libs.math
import
float_equal
from
magic_pdf.libs.math
import
float_equal
from
magic_pdf.para.para_split_v2
import
para_split
from
magic_pdf.para.para_split_v2
import
para_split
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
def
txt_spans_extract
(
pdf_page
,
inline_equations
,
interline_equations
):
def
txt_spans_extract
(
pdf_page
,
inline_equations
,
interline_equations
):
text_raw_blocks
=
pdf_page
.
get_text
(
"dict"
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
"blocks"
]
text_raw_blocks
=
pdf_page
.
get_text
(
"dict"
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
"blocks"
]
...
@@ -123,11 +126,38 @@ def parse_pdf_by_txt(
...
@@ -123,11 +126,38 @@ def parse_pdf_by_txt(
page_h
,
page_h
,
)
)
"""根据区块信息计算layout"""
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks
=
[]
for
bbox
in
all_bboxes
:
useful_blocks
.
append
({
"bbox"
:
bbox
[:
4
]
})
is_useful_block_horz_overlap
=
check_useful_block_horizontal_overlap
(
useful_blocks
)
if
is_useful_block_horz_overlap
:
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}"
)
continue
'''根据区块信息计算layout'''
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
all_bboxes
,
page_boundry
,
page_id
)
all_bboxes
,
page_boundry
,
page_id
)
if
len
(
text_blocks
)
>
0
and
len
(
all_bboxes
)
>
0
and
len
(
layout_bboxes
)
==
0
:
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}"
)
continue
"""以下去掉复杂的布局和超过2列的布局"""
if
any
([
lay
[
"layout_label"
]
==
LAYOUT_UNPROC
for
lay
in
layout_bboxes
]):
# 复杂的布局
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}"
)
continue
layout_column_width
=
get_columns_cnt_of_layout
(
layout_tree
)
if
layout_column_width
>
2
:
# 去掉超过2列的布局pdf
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}"
)
continue
"""根据layout顺序,对当前页面所有需要留下的block进行排序"""
"""根据layout顺序,对当前页面所有需要留下的block进行排序"""
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
...
...
magic_pdf/pre_proc/resolve_bbox_conflict.py
View file @
a48f1d14
"""
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
...
@@ -9,7 +8,8 @@ from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_over
...
@@ -9,7 +8,8 @@ from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_over
from
magic_pdf.libs.drop_tag
import
ON_IMAGE_TEXT
,
ON_TABLE_TEXT
from
magic_pdf.libs.drop_tag
import
ON_IMAGE_TEXT
,
ON_TABLE_TEXT
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
"""
"""
text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json
text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json
当下采用一种粗暴的方式:
当下采用一种粗暴的方式:
...
@@ -37,11 +37,11 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
...
@@ -37,11 +37,11 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
if
_is_in
(
text_bbox
,
table_box
):
if
_is_in
(
text_bbox
,
table_box
):
text_block
[
'tag'
]
=
ON_TABLE_TEXT
text_block
[
'tag'
]
=
ON_TABLE_TEXT
text_block_removed
.
append
(
text_block
)
text_block_removed
.
append
(
text_block
)
for
text_block
in
text_block_removed
:
for
text_block
in
text_block_removed
:
if
text_block
in
text_raw_blocks
:
if
text_block
in
text_raw_blocks
:
text_raw_blocks
.
remove
(
text_block
)
text_raw_blocks
.
remove
(
text_block
)
# 第一步去掉在图片上出现的公式box
# 第一步去掉在图片上出现的公式box
temp
=
[]
temp
=
[]
for
image_box
in
images
:
for
image_box
in
images
:
...
@@ -51,13 +51,13 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
...
@@ -51,13 +51,13 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
for
eq2
in
inline_equations
:
for
eq2
in
inline_equations
:
if
_is_in_or_part_overlap
(
image_box
,
eq2
[:
4
]):
if
_is_in_or_part_overlap
(
image_box
,
eq2
[:
4
]):
temp
.
append
(
eq2
)
temp
.
append
(
eq2
)
for
eq
in
temp
:
for
eq
in
temp
:
if
eq
in
interline_equations
:
if
eq
in
interline_equations
:
interline_equations
.
remove
(
eq
)
interline_equations
.
remove
(
eq
)
if
eq
in
inline_equations
:
if
eq
in
inline_equations
:
inline_equations
.
remove
(
eq
)
inline_equations
.
remove
(
eq
)
# 第二步去掉在表格上出现的公式box
# 第二步去掉在表格上出现的公式box
temp
=
[]
temp
=
[]
for
table_box
in
tables
:
for
table_box
in
tables
:
...
@@ -67,13 +67,13 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
...
@@ -67,13 +67,13 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
for
eq2
in
inline_equations
:
for
eq2
in
inline_equations
:
if
_is_in_or_part_overlap
(
table_box
,
eq2
[:
4
]):
if
_is_in_or_part_overlap
(
table_box
,
eq2
[:
4
]):
temp
.
append
(
eq2
)
temp
.
append
(
eq2
)
for
eq
in
temp
:
for
eq
in
temp
:
if
eq
in
interline_equations
:
if
eq
in
interline_equations
:
interline_equations
.
remove
(
eq
)
interline_equations
.
remove
(
eq
)
if
eq
in
inline_equations
:
if
eq
in
inline_equations
:
inline_equations
.
remove
(
eq
)
inline_equations
.
remove
(
eq
)
# 图片和文字重叠,丢掉图片
# 图片和文字重叠,丢掉图片
for
image_box
in
images
:
for
image_box
in
images
:
for
text_block
in
text_raw_blocks
:
for
text_block
in
text_raw_blocks
:
...
@@ -83,22 +83,22 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
...
@@ -83,22 +83,22 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
break
break
for
image_box
in
images_backup
:
for
image_box
in
images_backup
:
images
.
remove
(
image_box
)
images
.
remove
(
image_box
)
# 图片和图片重叠,两张都暂时不参与版面计算
# 图片和图片重叠,两张都暂时不参与版面计算
images_dup_index
=
[]
images_dup_index
=
[]
for
i
in
range
(
len
(
images
)):
for
i
in
range
(
len
(
images
)):
for
j
in
range
(
i
+
1
,
len
(
images
)):
for
j
in
range
(
i
+
1
,
len
(
images
)):
if
_is_in_or_part_overlap
(
images
[
i
],
images
[
j
]):
if
_is_in_or_part_overlap
(
images
[
i
],
images
[
j
]):
images_dup_index
.
append
(
i
)
images_dup_index
.
append
(
i
)
images_dup_index
.
append
(
j
)
images_dup_index
.
append
(
j
)
dup_idx
=
set
(
images_dup_index
)
dup_idx
=
set
(
images_dup_index
)
for
img_id
in
dup_idx
:
for
img_id
in
dup_idx
:
images_backup
.
append
(
images
[
img_id
])
images_backup
.
append
(
images
[
img_id
])
images
[
img_id
]
=
None
images
[
img_id
]
=
None
images
=
[
img
for
img
in
images
if
img
is
not
None
]
images
=
[
img
for
img
in
images
if
img
is
not
None
]
# 如果行间公式和文字block重叠,放到临时的数据里,防止这些文字box影响到layout计算。通过计算IOU合并行间公式和文字block
# 如果行间公式和文字block重叠,放到临时的数据里,防止这些文字box影响到layout计算。通过计算IOU合并行间公式和文字block
# 对于这样的文本块删除,然后保留行间公式的大小不变。
# 对于这样的文本块删除,然后保留行间公式的大小不变。
# 当计算完毕layout,这部分再合并回来
# 当计算完毕layout,这部分再合并回来
...
@@ -111,51 +111,76 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
...
@@ -111,51 +111,76 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
# text_block['tag'] = "belong-to-interline-equation"
# text_block['tag'] = "belong-to-interline-equation"
# text_block_removed_2.append(text_block)
# text_block_removed_2.append(text_block)
# break
# break
# for tb in text_block_removed_2:
# for tb in text_block_removed_2:
# if tb in text_raw_blocks:
# if tb in text_raw_blocks:
# text_raw_blocks.remove(tb)
# text_raw_blocks.remove(tb)
# text_block_removed = text_block_removed + text_block_removed_2
# text_block_removed = text_block_removed + text_block_removed_2
return
images
,
tables
,
interline_equations
,
inline_equations
,
text_raw_blocks
,
text_block_removed
,
images_backup
,
text_block_removed_2
return
images
,
tables
,
interline_equations
,
inline_equations
,
text_raw_blocks
,
text_block_removed
,
images_backup
,
text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """
    Report whether any two body text blocks overlap each other horizontally.

    Only blocks lying vertically between the header and footer clip lines are
    compared. When an overlap is found the pdf is not processed any further,
    because it most likely means a formula was not detected.

    Args:
        text_blocks: dicts that each carry a "bbox" entry; indices 1 and 3 of
            the bbox are read as its vertical bounds.
        header: sequence of boxes; the largest value found at index 1 becomes
            the upper clip line (0 when the sequence is empty).
        footer: sequence of boxes; the smallest value found at index 3 becomes
            the lower clip line (the largest bbox[3] among text_blocks when
            the sequence is empty).

    Returns:
        True as soon as `_is_left_overlap` holds for some pair of kept boxes,
        in either argument order; otherwise False.
    """
    if len(text_blocks) == 0:
        return False

    # Fallback clip lines used when no header / footer boxes are supplied.
    top_default = 0
    bottom_default = max(block['bbox'][3] for block in text_blocks)

    # NOTE(review): the header clip uses index 1 and the footer clip uses
    # index 3 of each box - confirm these are the intended edges.
    upper_clip = max([box[1] for box in header]) if len(header) > 0 else top_default
    lower_clip = min([box[3] for box in footer]) if len(footer) > 0 else bottom_default

    # Keep only the boxes that sit entirely between the two clip lines.
    candidate_boxes = (block["bbox"] for block in text_blocks)
    kept_boxes = [
        box for box in candidate_boxes
        if box[1] >= upper_clip and box[3] <= lower_clip
    ]

    # Compare every unordered pair once, testing both argument orders.
    for position, first in enumerate(kept_boxes):
        for second in kept_boxes[position + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True

    return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
    """
    Report whether any two useful blocks overlap each other horizontally.

    Callers use a positive result to skip the page, on the assumption that
    such overlap most likely comes from an interline formula or a table that
    was not recognised correctly.

    Args:
        useful_blocks: dicts that each carry a "bbox" entry; index 1 of the
            bbox is read as its top edge.

    Returns:
        True as soon as `_is_left_overlap` holds for some pair of kept boxes,
        in either argument order; False otherwise, including for empty input.
    """
    if not useful_blocks:
        return False

    # The only vertical filter is the page top (y = 0): a block whose top
    # edge is negative is left out of the comparison.
    # NOTE(review): confirm that ignoring such blocks is intended.
    page_min_y = 0
    useful_bboxes = [
        bbox
        for bbox in (block["bbox"] for block in useful_blocks)
        if bbox[1] >= page_min_y
    ]

    # Compare every unordered pair once, testing both argument orders.
    for position, first in enumerate(useful_bboxes):
        for second in useful_bboxes[position + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True

    return False
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment