Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
5c046cca
Unverified
Commit
5c046cca
authored
May 21, 2024
by
myhloli
Committed by
GitHub
May 21, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #111 from papayalove/master
分段代码规范化
parents
97e67881
330a7cd9
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
5 additions
and
20 deletions
+5
-20
para_split_v2.py
magic_pdf/para/para_split_v2.py
+5
-20
No files found.
magic_pdf/para/para_split_v2.py
View file @
5c046cca
...
...
@@ -258,29 +258,19 @@ def __pre_proc_en_blocks(blocks, layout_bboxes):
pass
def
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
=
"en"
):
def
__group_line_by_layout
(
blocks
,
layout_bboxes
):
"""
每个layout内的行进行聚合
"""
# 因为只是一个block一行目前, 一个block就是一个段落
blocks_group
=
[]
for
lyout
in
layout_bboxes
:
#lines = [line for block in blocks if block["type"] == BlockType.Text and is_in_layout(block['bbox'], lyout['layout_bbox']) for line in
# block['lines']]
blocks_in_layout
=
[
block
for
block
in
blocks
if
is_in_layout
(
block
[
'bbox'
],
lyout
[
'layout_bbox'
])]
blocks_group
.
append
(
blocks_in_layout
)
return
blocks_group
def
__split_para_in_layoutbox2
(
lines_group
,
new_layout_bbox
,
lang
=
"en"
,
char_avg_len
=
10
):
"""
"""
def
__split_para_in_layoutbox
(
blocks_group
,
new_layout_bbox
,
lang
=
"en"
,
char_avg_len
=
10
):
def
__split_para_in_layoutbox
(
blocks_group
,
new_layout_bbox
,
lang
=
"en"
):
"""
lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
1. 先计算每个group的左右边界。
...
...
@@ -329,9 +319,6 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg
index
=
list_start
[
i
]
-
1
if
"content"
in
lines
[
index
][
"spans"
][
-
1
]:
lines
[
index
][
"spans"
][
-
1
][
"content"
]
+=
'
\n\n
'
# layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2]
# layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]
para
=
[]
# 元素是line
layout_list_info
=
[
False
,
False
]
# 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
for
content_type
,
start
,
end
in
text_segments
:
if
content_type
==
'list'
:
...
...
@@ -340,7 +327,6 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg
if
end
==
total_lines
-
1
and
is_end_list
is
None
:
layout_list_info
[
1
]
=
True
# paras = __split_para_lines(lines, text_blocks)
list_info
.
append
(
layout_list_info
)
return
list_info
...
...
@@ -472,7 +458,7 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
return
None
def
__connect_para_inter_layoutbox
(
blocks_group
,
new_layout_bbox
,
lang
):
def
__connect_para_inter_layoutbox
(
blocks_group
,
new_layout_bbox
):
"""
layout之间进行分段。
主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
...
...
@@ -481,7 +467,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang):
2. 下一行开头不留空白。
"""
connected_layout_paras
=
[]
connected_layout_blocks
=
[]
if
len
(
blocks_group
)
==
0
:
return
connected_layout_blocks
...
...
@@ -689,11 +674,11 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
3. 参照上述行尾特征进行分段。
4. 图、表,目前独占一行,不考虑分段。
"""
blocks_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
blocks_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
)
# block内分段
layout_list_info
=
__split_para_in_layoutbox
(
blocks_group
,
new_layout_bbox
,
lang
)
# layout内分段
blocks_group
,
page_list_info
=
__connect_list_inter_layout
(
blocks_group
,
new_layout_bbox
,
layout_list_info
,
page_num
,
lang
)
# layout之间连接列表段落
connected_layout_blocks
=
__connect_para_inter_layoutbox
(
blocks_group
,
new_layout_bbox
,
lang
)
# layout间链接段落
connected_layout_blocks
=
__connect_para_inter_layoutbox
(
blocks_group
,
new_layout_bbox
)
# layout间链接段落
return
connected_layout_blocks
,
page_list_info
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment