Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
10a95bcd
Unverified
Commit
10a95bcd
authored
May 24, 2024
by
myhloli
Committed by
GitHub
May 24, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #114 from papayalove/master
修复list拼接和reference分行问题
parents
3711a333
dbdbaf58
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
57 additions
and
29 deletions
+57
-29
para_split_v2.py
magic_pdf/para/para_split_v2.py
+57
-29
No files found.
magic_pdf/para/para_split_v2.py
View file @
10a95bcd
from
sklearn.cluster
import
DBSCAN
from
sklearn.cluster
import
DBSCAN
import
numpy
as
np
import
numpy
as
np
from
loguru
import
logger
from
loguru
import
logger
import
re
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap_with_area_ratio
as
is_in_layout
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap_with_area_ratio
as
is_in_layout
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
...
@@ -106,16 +106,18 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
...
@@ -106,16 +106,18 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
3. 如果非顶格,首字符大写,编码为2
3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3
4. 如果非顶格,首字符非大写编码为3
"""
"""
x_map_tag_dict
,
min_x_tag
=
cluster_line_x
(
lines
)
for
l
in
lines
:
for
l
in
lines
:
first_char
=
__get_span_text
(
l
[
'spans'
][
0
])[
0
]
span_text
=
__get_span_text
(
l
[
'spans'
][
0
])
first_char
=
span_text
[
0
]
layout
=
__find_layout_bbox_by_line
(
l
[
'bbox'
],
new_layout_bboxes
)
layout
=
__find_layout_bbox_by_line
(
l
[
'bbox'
],
new_layout_bboxes
)
if
not
layout
:
if
not
layout
:
line_fea_encode
.
append
(
0
)
line_fea_encode
.
append
(
0
)
else
:
else
:
layout_left
=
layout
[
0
]
#
if
l
[
'bbox'
][
0
]
==
layout_left
:
if
x_map_tag_dict
[
round
(
l
[
'bbox'
][
0
])]
==
min_x_tag
:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if
not
first_char
.
isalnum
():
if
not
first_char
.
isalnum
()
or
if_match_reference_list
(
span_text
)
:
line_fea_encode
.
append
(
1
)
line_fea_encode
.
append
(
1
)
else
:
else
:
line_fea_encode
.
append
(
4
)
line_fea_encode
.
append
(
4
)
...
@@ -144,6 +146,36 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
...
@@ -144,6 +146,36 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
return
split_indices
(
total_lines
,
list_indice
),
list_start_idx
return
split_indices
(
total_lines
,
list_indice
),
list_start_idx
def cluster_line_x(lines: list, min_distance: float = 5) -> tuple:
    """Cluster the x0 (left edge) of every line's bbox within one block.

    Each x0 is rounded and clustered with DBSCAN. Every rounded x0 is then
    mapped to the smallest rounded x0 found in its cluster, so that lines
    whose left edges differ only slightly share one representative value.

    Args:
        lines: Line dicts; each must provide its left edge as
            ``line['bbox'][0]``.
        min_distance: Value passed to DBSCAN as ``eps``. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        A ``(x0_2_new_val, min_x0)`` tuple. ``x0_2_new_val`` maps each
        rounded x0 to its cluster's smallest rounded x0. ``min_x0`` is the
        smallest such representative, never larger than the rounded x0 of
        the first line. Both are plain Python ints. For empty ``lines``
        the result is ``({}, None)``.
    """
    if not lines:
        # Nothing to cluster; indexing lines[0] below would also fail.
        return {}, None

    # With min_samples=1 DBSCAN should treat every point as a core point,
    # so no line is expected to be labelled as noise (-1).
    min_sample = 1
    # Samples are given to DBSCAN as 2-D points: each x0 is paired with a
    # constant 0 as second coordinate.
    x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
    labels = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst).labels_

    x0_2_new_val = {}  # rounded x0 -> representative x0 of its cluster
    min_x0 = int(round(lines[0]['bbox'][0]))
    for label in np.unique(labels):
        if label == -1:
            # Noise points get no mapping entry.
            continue
        x0_raw_val = x0_lst[labels == label][:, 0]
        # Convert to a plain int so the map values and min_x0 never mix
        # Python ints with NumPy scalars.
        x0_new_val = int(round(np.min(x0_raw_val)))
        for raw_val in x0_raw_val:
            x0_2_new_val[int(round(raw_val))] = x0_new_val
        if x0_new_val < min_x0:
            min_x0 = x0_new_val
    return x0_2_new_val, min_x0
def if_match_reference_list(text: str) -> bool:
    """Tell whether ``text`` begins like a numbered reference entry.

    The text matches when it starts with one or more digits immediately
    followed by a dot, e.g. ``"12. Some title"``.
    """
    matched = re.match(r'^\d+\..*', text)
    return matched is not None
def
__valign_lines
(
blocks
,
layout_bboxes
):
def
__valign_lines
(
blocks
,
layout_bboxes
):
"""
"""
...
@@ -315,8 +347,9 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
...
@@ -315,8 +347,9 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
"""
"""
for
list_start
in
list_start_line
:
for
list_start
in
list_start_line
:
if
len
(
list_start
)
>
1
:
if
len
(
list_start
)
>
1
:
for
i
in
range
(
1
,
len
(
list_start
)):
for
i
in
range
(
0
,
len
(
list_start
)):
index
=
list_start
[
i
]
-
1
index
=
list_start
[
i
]
-
1
if
index
>=
0
:
if
"content"
in
lines
[
index
][
"spans"
][
-
1
]:
if
"content"
in
lines
[
index
][
"spans"
][
-
1
]:
lines
[
index
][
"spans"
][
-
1
][
"content"
]
+=
'
\n\n
'
lines
[
index
][
"spans"
][
-
1
][
"content"
]
+=
'
\n\n
'
layout_list_info
=
[
False
,
False
]
# 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
layout_list_info
=
[
False
,
False
]
# 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
...
@@ -388,20 +421,17 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
...
@@ -388,20 +421,17 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
logger
.
info
(
f
"连接page {page_num} 内的list"
)
logger
.
info
(
f
"连接page {page_num} 内的list"
)
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines
=
[]
may_list_lines
=
[]
for
j
in
range
(
len
(
next_paras
)):
lines
=
next_first_para
.
get
(
"lines"
,
[])
lines
=
next_paras
[
j
]
.
get
(
"lines"
,
[])
if
len
(
lines
)
==
1
:
# 只可能是一行,多行情况再需要分析了
for
line
in
lines
:
if
lines
[
0
][
'bbox'
][
0
]
>
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
new_layout_bbox
)[
0
]:
if
line
[
'bbox'
][
0
]
>
__find_layout_bbox_by_line
(
line
[
'bbox'
],
new_layout_bbox
)[
0
]:
may_list_lines
.
append
(
lines
[
0
])
may_list_lines
.
append
(
line
)
else
:
break
else
:
else
:
break
break
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
if
len
(
may_list_lines
)
>
0
and
len
(
set
([
x
[
'bbox'
][
0
]
for
x
in
may_list_lines
]))
==
1
:
if
len
(
may_list_lines
)
>
0
and
len
(
set
([
x
[
'bbox'
][
0
]
for
x
in
may_list_lines
]))
==
1
:
pre_last_para
.
extend
(
may_list_lines
)
pre_last_para
.
extend
(
may_list_lines
)
blocks_group
[
i
]
=
blocks_group
[
i
][
len
(
may_list_lines
):]
next_first_para
[
"lines"
]
=
next_first_para
[
"lines"
][
len
(
may_list_lines
):]
# layout_paras[i] = layout_paras[i][len(may_list_lines):]
return
blocks_group
,
[
layout_list_info
[
0
][
0
],
layout_list_info
[
-
1
][
1
]]
# 同时还返回了这个页面级别的开头、结尾是不是列表的信息
return
blocks_group
,
[
layout_list_info
[
0
][
0
],
layout_list_info
[
-
1
][
1
]]
# 同时还返回了这个页面级别的开头、结尾是不是列表的信息
...
@@ -422,16 +452,12 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
...
@@ -422,16 +452,12 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
logger
.
info
(
f
"连接page {page_num} 内的list"
)
logger
.
info
(
f
"连接page {page_num} 内的list"
)
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines
=
[]
may_list_lines
=
[]
for
j
in
range
(
len
(
next_page_paras
[
0
])):
next_page_first_para
=
next_page_paras
[
0
][
0
]
next_page_block_j
=
next_page_paras
[
0
][
j
]
if
next_page_first_para
[
"type"
]
==
BlockType
.
Text
:
if
next_page_block_j
[
"type"
]
!=
BlockType
.
Text
:
lines
=
next_page_first_para
[
"lines"
]
break
for
line
in
lines
:
lines
=
next_page_block_j
[
"lines"
]
if
line
[
'bbox'
][
0
]
>
__find_layout_bbox_by_line
(
line
[
'bbox'
],
next_page_layout_bbox
)[
0
]:
if
len
(
lines
)
==
1
:
# 只可能是一行,多行情况再需要分析了
may_list_lines
.
append
(
line
)
if
lines
[
0
][
'bbox'
][
0
]
>
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
next_page_layout_bbox
)[
0
]:
may_list_lines
.
append
(
lines
[
0
])
else
:
break
else
:
else
:
break
break
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
...
@@ -442,7 +468,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
...
@@ -442,7 +468,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
for
span
in
line
[
"spans"
]:
for
span
in
line
[
"spans"
]:
span
[
CROSS_PAGE
]
=
True
span
[
CROSS_PAGE
]
=
True
pre_page_paras
[
-
1
][
-
1
][
"lines"
]
.
extend
(
may_list_lines
)
pre_page_paras
[
-
1
][
-
1
][
"lines"
]
.
extend
(
may_list_lines
)
next_page_
paras
[
0
]
=
next_page_paras
[
0
][
len
(
may_list_lines
):]
next_page_
first_para
[
"lines"
]
=
next_page_first_para
[
"lines"
][
len
(
may_list_lines
):]
return
True
return
True
return
False
return
False
...
@@ -471,7 +497,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
...
@@ -471,7 +497,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
if
len
(
blocks_group
)
==
0
:
if
len
(
blocks_group
)
==
0
:
return
connected_layout_blocks
return
connected_layout_blocks
#connected_layout_paras.append(layout_paras[0])
connected_layout_blocks
.
append
(
blocks_group
[
0
])
connected_layout_blocks
.
append
(
blocks_group
[
0
])
for
i
in
range
(
1
,
len
(
blocks_group
)):
for
i
in
range
(
1
,
len
(
blocks_group
)):
try
:
try
:
...
@@ -484,6 +509,9 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
...
@@ -484,6 +509,9 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
if
blocks_group
[
i
-
1
][
-
1
][
"type"
]
!=
BlockType
.
Text
or
blocks_group
[
i
][
0
][
"type"
]
!=
BlockType
.
Text
:
if
blocks_group
[
i
-
1
][
-
1
][
"type"
]
!=
BlockType
.
Text
or
blocks_group
[
i
][
0
][
"type"
]
!=
BlockType
.
Text
:
connected_layout_blocks
.
append
(
blocks_group
[
i
])
connected_layout_blocks
.
append
(
blocks_group
[
i
])
continue
continue
if
len
(
blocks_group
[
i
-
1
][
-
1
][
"lines"
])
==
0
or
len
(
blocks_group
[
i
][
0
][
"lines"
])
==
0
:
connected_layout_blocks
.
append
(
blocks_group
[
i
])
continue
pre_last_line
=
blocks_group
[
i
-
1
][
-
1
][
"lines"
][
-
1
]
pre_last_line
=
blocks_group
[
i
-
1
][
-
1
][
"lines"
][
-
1
]
next_first_line
=
blocks_group
[
i
][
0
][
"lines"
][
0
]
next_first_line
=
blocks_group
[
i
][
0
][
"lines"
][
0
]
except
Exception
as
e
:
except
Exception
as
e
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment