Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
d3c9cb84
Commit
d3c9cb84
authored
Mar 25, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
分段部分log限定在debug模式下才能输出
parent
8c089976
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
15 additions
and
12 deletions
+15
-12
para_split.py
magic_pdf/para/para_split.py
+14
-11
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+1
-1
No files found.
magic_pdf/para/para_split.py
View file @
d3c9cb84
...
...
@@ -501,7 +501,7 @@ def find_consecutive_true_regions(input_array):
return
regions
def
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
):
def
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
,
debug_mode
):
"""
找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。
一个line居中的条件是:
...
...
@@ -527,8 +527,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
first_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
layout_para
[
start
][
0
][
'spans'
]])
if
"Table"
in
first_line_text
or
"Figure"
in
first_line_text
:
pass
logger
.
info
(
line_hi
.
std
())
if
debug_mode
:
logger
.
info
(
line_hi
.
std
())
if
line_hi
.
std
()
<
2
:
"""行高度相同,那么判断是否居中"""
...
...
@@ -540,7 +540,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
and
not
all
([
x1
==
layout_box
[
2
]
for
x1
in
all_right_x1
]):
merge_para
=
[
l
[
0
]
for
l
in
layout_para
[
start
:
end
+
1
]]
para_text
=
''
.
join
([
__get_span_text
(
span
)
for
line
in
merge_para
for
span
in
line
[
'spans'
]])
logger
.
info
(
para_text
)
if
debug_mode
:
logger
.
info
(
para_text
)
layout_para
[
start
:
end
+
1
]
=
[
merge_para
]
index_offset
-=
end
-
start
...
...
@@ -576,7 +577,7 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
return
connected_layout_paras
,
page_list_info
def
para_split
(
pdf_info_dict
,
lang
=
"en"
):
def
para_split
(
pdf_info_dict
,
debug_mode
,
lang
=
"en"
):
"""
根据line和layout情况进行分段
"""
...
...
@@ -601,13 +602,15 @@ def para_split(pdf_info_dict, lang="en"):
pre_page_layout_bbox
=
new_layout_of_pages
[
page_num
-
1
]
next_page_layout_bbox
=
new_layout_of_pages
[
page_num
]
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
page_num
,
lang
)
if
is_conn
:
logger
.
info
(
f
"连接了第{page_num-1}页和第{page_num}页的段落"
)
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
page_num
,
lang
)
if
debug_mode
:
if
is_conn
:
logger
.
info
(
f
"连接了第{page_num-1}页和第{page_num}页的段落"
)
is_list_conn
=
__connect_list_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
all_page_list_info
[
page_num
-
1
],
all_page_list_info
[
page_num
],
page_num
,
lang
)
if
is_list_conn
:
logger
.
info
(
f
"连接了第{page_num-1}页和第{page_num}页的列表段落"
)
if
debug_mode
:
if
is_list_conn
:
logger
.
info
(
f
"连接了第{page_num-1}页和第{page_num}页的列表段落"
)
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
...
...
@@ -616,5 +619,5 @@ def para_split(pdf_info_dict, lang="en"):
for
page_num
,
page
in
enumerate
(
pdf_info_dict
.
values
()):
page_paras
=
page
[
'para_blocks'
]
new_layout_bbox
=
new_layout_of_pages
[
page_num
]
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
)
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
,
debug_mode
=
debug_mode
)
__merge_signle_list_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
)
magic_pdf/pdf_parse_by_ocr.py
View file @
d3c9cb84
...
...
@@ -269,7 +269,7 @@ def parse_pdf_by_ocr(
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
"""分段"""
para_split
(
pdf_info_dict
)
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
'''在测试时,保存调试信息'''
if
debug_mode
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment