Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
442f3684
Commit
442f3684
authored
Apr 29, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix complicated layout logic
parent
232964d0
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
82 additions
and
30 deletions
+82
-30
drop_reason.py
magic_pdf/libs/drop_reason.py
+1
-0
pdf_parse_by_ocr_v2.py
magic_pdf/pdf_parse_by_ocr_v2.py
+36
-14
pdf_parse_by_txt_v2.py
magic_pdf/pdf_parse_by_txt_v2.py
+35
-13
construct_page_dict.py
magic_pdf/pre_proc/construct_page_dict.py
+3
-1
resolve_bbox_conflict.py
magic_pdf/pre_proc/resolve_bbox_conflict.py
+7
-2
No files found.
magic_pdf/libs/drop_reason.py
View file @
442f3684
class
DropReason
:
class
DropReason
:
TEXT_BLCOK_HOR_OVERLAP
=
"text_block_horizontal_overlap"
# 文字块有水平互相覆盖,导致无法准确定位文字顺序
TEXT_BLCOK_HOR_OVERLAP
=
"text_block_horizontal_overlap"
# 文字块有水平互相覆盖,导致无法准确定位文字顺序
USEFUL_BLOCK_HOR_OVERLAP
=
"useful_block_horizontal_overlap"
# 需保留的block水平覆盖
COMPLICATED_LAYOUT
=
"complicated_layout"
# 复杂的布局,暂时不支持
COMPLICATED_LAYOUT
=
"complicated_layout"
# 复杂的布局,暂时不支持
TOO_MANY_LAYOUT_COLUMNS
=
"too_many_layout_columns"
# 目前不支持分栏超过2列的
TOO_MANY_LAYOUT_COLUMNS
=
"too_many_layout_columns"
# 目前不支持分栏超过2列的
COLOR_BACKGROUND_TEXT_BOX
=
"color_background_text_box"
# 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
COLOR_BACKGROUND_TEXT_BOX
=
"color_background_text_box"
# 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
...
...
magic_pdf/pdf_parse_by_ocr_v2.py
View file @
442f3684
...
@@ -18,6 +18,22 @@ from magic_pdf.para.para_split_v2 import para_split
...
@@ -18,6 +18,22 @@ from magic_pdf.para.para_split_v2 import para_split
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
    """Drop the smaller of two horizontally overlapping bboxes, if any.

    Each entry of ``all_bboxes`` is reduced to its first four items and
    handed to ``check_useful_block_horizontal_overlap``, which reports
    whether an overlap exists and which bbox of the offending pair is the
    smaller one. On overlap, every entry whose first four items equal that
    smaller bbox is removed from ``all_bboxes`` in place.

    Args:
        all_bboxes: mutable sequence of bbox entries; only ``entry[:4]`` is
            inspected here.

    Returns:
        A ``(is_useful_block_horz_overlap, all_bboxes)`` tuple. The second
        item is the same object that was passed in, possibly with entries
        removed.
    """
    useful_blocks = [{"bbox": bbox[:4]} for bbox in all_bboxes]
    is_useful_block_horz_overlap, smaller_bbox = check_useful_block_horizontal_overlap(useful_blocks)
    if is_useful_block_horz_overlap:
        # Log the same reason the caller records as drop_reason, matching the
        # sibling helper in the txt parser.
        logger.warning(f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")
        # Iterate over a snapshot because the list is mutated in the loop.
        # NOTE(review): if no entry compares equal to smaller_bbox (e.g. a
        # list/tuple type mismatch), nothing is removed while True is still
        # returned; a caller that loops until False would not terminate.
        for bbox in all_bboxes.copy():
            if smaller_bbox == bbox[:4]:
                all_bboxes.remove(bbox)

    return is_useful_block_horz_overlap, all_bboxes
def
parse_pdf_by_ocr
(
pdf_bytes
,
def
parse_pdf_by_ocr
(
pdf_bytes
,
model_list
,
model_list
,
imageWriter
,
imageWriter
,
...
@@ -25,6 +41,9 @@ def parse_pdf_by_ocr(pdf_bytes,
...
@@ -25,6 +41,9 @@ def parse_pdf_by_ocr(pdf_bytes,
end_page_id
=
None
,
end_page_id
=
None
,
debug_mode
=
False
,
debug_mode
=
False
,
):
):
need_drop
=
False
drop_reason
=
""
pdf_bytes_md5
=
compute_md5
(
pdf_bytes
)
pdf_bytes_md5
=
compute_md5
(
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
...
@@ -66,16 +85,14 @@ def parse_pdf_by_ocr(pdf_bytes,
...
@@ -66,16 +85,14 @@ def parse_pdf_by_ocr(pdf_bytes,
interline_equations
,
page_w
,
page_h
)
interline_equations
,
page_w
,
page_h
)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks
=
[]
for
bbox
in
all_bboxes
:
while
True
:
# 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
useful_blocks
.
append
({
is_useful_block_horz_overlap
,
all_bboxes
=
remove_horizontal_overlap_block_which_smaller
(
all_bboxes
)
"bbox"
:
bbox
[:
4
]
if
is_useful_block_horz_overlap
:
})
need_drop
=
True
is_useful_block_horz_overlap
=
check_useful_block_horizontal_overlap
(
useful_blocks
)
drop_reason
=
DropReason
.
USEFUL_BLOCK_HOR_OVERLAP
if
is_useful_block_horz_overlap
:
else
:
logger
.
warning
(
break
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}"
)
continue
'''根据区块信息计算layout'''
'''根据区块信息计算layout'''
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
...
@@ -84,19 +101,23 @@ def parse_pdf_by_ocr(pdf_bytes,
...
@@ -84,19 +101,23 @@ def parse_pdf_by_ocr(pdf_bytes,
if
len
(
text_blocks
)
>
0
and
len
(
all_bboxes
)
>
0
and
len
(
layout_bboxes
)
==
0
:
if
len
(
text_blocks
)
>
0
and
len
(
all_bboxes
)
>
0
and
len
(
layout_bboxes
)
==
0
:
logger
.
warning
(
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}"
)
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}"
)
continue
need_drop
=
True
drop_reason
=
DropReason
.
CAN_NOT_DETECT_PAGE_LAYOUT
"""以下去掉复杂的布局和超过2列的布局"""
"""以下去掉复杂的布局和超过2列的布局"""
if
any
([
lay
[
"layout_label"
]
==
LAYOUT_UNPROC
for
lay
in
layout_bboxes
]):
# 复杂的布局
if
any
([
lay
[
"layout_label"
]
==
LAYOUT_UNPROC
for
lay
in
layout_bboxes
]):
# 复杂的布局
logger
.
warning
(
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}"
)
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}"
)
continue
need_drop
=
True
drop_reason
=
DropReason
.
COMPLICATED_LAYOUT
layout_column_width
=
get_columns_cnt_of_layout
(
layout_tree
)
layout_column_width
=
get_columns_cnt_of_layout
(
layout_tree
)
if
layout_column_width
>
2
:
# 去掉超过2列的布局pdf
if
layout_column_width
>
2
:
# 去掉超过2列的布局pdf
logger
.
warning
(
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}"
)
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}"
)
continue
need_drop
=
True
drop_reason
=
DropReason
.
TOO_MANY_LAYOUT_COLUMNS
'''根据layout顺序,对当前页面所有需要留下的block进行排序'''
'''根据layout顺序,对当前页面所有需要留下的block进行排序'''
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
...
@@ -119,7 +140,8 @@ def parse_pdf_by_ocr(pdf_bytes,
...
@@ -119,7 +140,8 @@ def parse_pdf_by_ocr(pdf_bytes,
'''构造pdf_info_dict'''
'''构造pdf_info_dict'''
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
discarded_blocks
)
images
,
tables
,
interline_equations
,
discarded_blocks
,
need_drop
,
drop_reason
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
"""分段"""
"""分段"""
...
...
magic_pdf/pdf_parse_by_txt_v2.py
View file @
442f3684
...
@@ -32,6 +32,22 @@ from magic_pdf.libs.math import float_equal
...
@@ -32,6 +32,22 @@ from magic_pdf.libs.math import float_equal
from
magic_pdf.para.para_split_v2
import
para_split
from
magic_pdf.para.para_split_v2
import
para_split
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
    """Remove the smaller bbox of a horizontally overlapping pair.

    Builds one ``{"bbox": entry[:4]}`` dict per entry of ``all_bboxes`` and
    passes them to ``check_useful_block_horizontal_overlap``. When an
    overlap is reported, a warning is logged and every entry whose first
    four items equal the reported smaller bbox is removed from
    ``all_bboxes`` in place.

    Returns:
        ``(overlap_found, all_bboxes)`` where ``all_bboxes`` is the same
        object that was passed in.
    """
    candidates = [{"bbox": entry[:4]} for entry in all_bboxes]
    overlap_found, smaller_bbox = check_useful_block_horizontal_overlap(candidates)

    # Nothing overlaps: hand the list back untouched.
    if not overlap_found:
        return overlap_found, all_bboxes

    logger.warning(f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")

    # Walk a snapshot so removals do not disturb the iteration.
    snapshot = all_bboxes.copy()
    for entry in snapshot:
        if smaller_bbox == entry[:4]:
            all_bboxes.remove(entry)

    return overlap_found, all_bboxes
def
txt_spans_extract
(
pdf_page
,
inline_equations
,
interline_equations
):
def
txt_spans_extract
(
pdf_page
,
inline_equations
,
interline_equations
):
text_raw_blocks
=
pdf_page
.
get_text
(
"dict"
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
"blocks"
]
text_raw_blocks
=
pdf_page
.
get_text
(
"dict"
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
"blocks"
]
...
@@ -91,6 +107,9 @@ def parse_pdf_by_txt(
...
@@ -91,6 +107,9 @@ def parse_pdf_by_txt(
end_page_id
=
None
,
end_page_id
=
None
,
debug_mode
=
False
,
debug_mode
=
False
,
):
):
need_drop
=
False
drop_reason
=
""
pdf_bytes_md5
=
compute_md5
(
pdf_bytes
)
pdf_bytes_md5
=
compute_md5
(
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
...
@@ -141,16 +160,14 @@ def parse_pdf_by_txt(
...
@@ -141,16 +160,14 @@ def parse_pdf_by_txt(
)
)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks
=
[]
for
bbox
in
all_bboxes
:
while
True
:
# 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
useful_blocks
.
append
({
is_useful_block_horz_overlap
,
all_bboxes
=
remove_horizontal_overlap_block_which_smaller
(
all_bboxes
)
"bbox"
:
bbox
[:
4
]
if
is_useful_block_horz_overlap
:
})
need_drop
=
True
is_useful_block_horz_overlap
=
check_useful_block_horizontal_overlap
(
useful_blocks
)
drop_reason
=
DropReason
.
USEFUL_BLOCK_HOR_OVERLAP
if
is_useful_block_horz_overlap
:
else
:
logger
.
warning
(
break
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}"
)
continue
'''根据区块信息计算layout'''
'''根据区块信息计算layout'''
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
...
@@ -159,19 +176,22 @@ def parse_pdf_by_txt(
...
@@ -159,19 +176,22 @@ def parse_pdf_by_txt(
if
len
(
text_blocks
)
>
0
and
len
(
all_bboxes
)
>
0
and
len
(
layout_bboxes
)
==
0
:
if
len
(
text_blocks
)
>
0
and
len
(
all_bboxes
)
>
0
and
len
(
layout_bboxes
)
==
0
:
logger
.
warning
(
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}"
)
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}"
)
continue
need_drop
=
True
drop_reason
=
DropReason
.
CAN_NOT_DETECT_PAGE_LAYOUT
"""以下去掉复杂的布局和超过2列的布局"""
"""以下去掉复杂的布局和超过2列的布局"""
if
any
([
lay
[
"layout_label"
]
==
LAYOUT_UNPROC
for
lay
in
layout_bboxes
]):
# 复杂的布局
if
any
([
lay
[
"layout_label"
]
==
LAYOUT_UNPROC
for
lay
in
layout_bboxes
]):
# 复杂的布局
logger
.
warning
(
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}"
)
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}"
)
continue
need_drop
=
True
drop_reason
=
DropReason
.
COMPLICATED_LAYOUT
layout_column_width
=
get_columns_cnt_of_layout
(
layout_tree
)
layout_column_width
=
get_columns_cnt_of_layout
(
layout_tree
)
if
layout_column_width
>
2
:
# 去掉超过2列的布局pdf
if
layout_column_width
>
2
:
# 去掉超过2列的布局pdf
logger
.
warning
(
logger
.
warning
(
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}"
)
f
"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}"
)
continue
need_drop
=
True
drop_reason
=
DropReason
.
TOO_MANY_LAYOUT_COLUMNS
"""根据layout顺序,对当前页面所有需要留下的block进行排序"""
"""根据layout顺序,对当前页面所有需要留下的block进行排序"""
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
...
@@ -211,6 +231,8 @@ def parse_pdf_by_txt(
...
@@ -211,6 +231,8 @@ def parse_pdf_by_txt(
tables
,
tables
,
interline_equations
,
interline_equations
,
discarded_blocks
,
discarded_blocks
,
need_drop
,
drop_reason
)
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
...
...
magic_pdf/pre_proc/construct_page_dict.py
View file @
442f3684
...
@@ -55,7 +55,7 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
...
@@ -55,7 +55,7 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
def
ocr_construct_page_component_v2
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
def
ocr_construct_page_component_v2
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
discarded_blocks
):
images
,
tables
,
interline_equations
,
discarded_blocks
,
need_drop
,
drop_reason
):
return_dict
=
{
return_dict
=
{
'preproc_blocks'
:
blocks
,
'preproc_blocks'
:
blocks
,
'layout_bboxes'
:
layout_bboxes
,
'layout_bboxes'
:
layout_bboxes
,
...
@@ -66,5 +66,7 @@ def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page
...
@@ -66,5 +66,7 @@ def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page
'tables'
:
tables
,
'tables'
:
tables
,
'interline_equations'
:
interline_equations
,
'interline_equations'
:
interline_equations
,
'discarded_blocks'
:
discarded_blocks
,
'discarded_blocks'
:
discarded_blocks
,
'need_drop'
:
need_drop
,
'drop_reason'
:
drop_reason
,
}
}
return
return_dict
return
return_dict
magic_pdf/pre_proc/resolve_bbox_conflict.py
View file @
442f3684
...
@@ -180,7 +180,12 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
...
@@ -180,7 +180,12 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
for
i
in
range
(
len
(
useful_bboxes
)):
for
i
in
range
(
len
(
useful_bboxes
)):
for
j
in
range
(
i
+
1
,
len
(
useful_bboxes
)):
for
j
in
range
(
i
+
1
,
len
(
useful_bboxes
)):
area_i
=
(
useful_bboxes
[
i
][
2
]
-
useful_bboxes
[
i
][
0
])
*
(
useful_bboxes
[
i
][
3
]
-
useful_bboxes
[
i
][
1
])
area_j
=
(
useful_bboxes
[
j
][
2
]
-
useful_bboxes
[
j
][
0
])
*
(
useful_bboxes
[
j
][
3
]
-
useful_bboxes
[
j
][
1
])
if
_is_left_overlap
(
useful_bboxes
[
i
],
useful_bboxes
[
j
])
or
_is_left_overlap
(
useful_bboxes
[
j
],
useful_bboxes
[
i
]):
if
_is_left_overlap
(
useful_bboxes
[
i
],
useful_bboxes
[
j
])
or
_is_left_overlap
(
useful_bboxes
[
j
],
useful_bboxes
[
i
]):
return
True
if
area_i
>
area_j
:
return
True
,
useful_bboxes
[
j
]
else
:
return
True
,
useful_bboxes
[
i
]
return
False
return
False
,
None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment