Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
520617dd
Unverified
Commit
520617dd
authored
Apr 29, 2024
by
myhloli
Committed by
GitHub
Apr 29, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #83 from papayalove/master
解决部分list不换行问题
parents
e5adbf93
503b9fad
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
33 additions
and
12 deletions
+33
-12
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+1
-1
para_split_v2.py
magic_pdf/para/para_split_v2.py
+32
-11
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
520617dd
...
...
@@ -11,7 +11,7 @@ import re
def
split_long_words
(
text
):
segments
=
text
.
split
(
' '
)
for
i
in
range
(
len
(
segments
)):
words
=
re
.
findall
(
r'\w+|[^\w
\s
]'
,
segments
[
i
],
re
.
UNICODE
)
words
=
re
.
findall
(
r'\w+|[^\w]'
,
segments
[
i
],
re
.
UNICODE
)
for
j
in
range
(
len
(
words
)):
if
len
(
words
[
j
])
>
15
:
words
[
j
]
=
' '
.
join
(
wordninja
.
split
(
words
[
j
]))
...
...
magic_pdf/para/para_split_v2.py
View file @
520617dd
...
...
@@ -26,6 +26,26 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
这样的段落特点是,顶格字母大写/数字,紧跟着几行缩进的。缩进的行首字母含小写的。
"""
def find_repeating_patterns2(lst):
    """Find runs in ``lst`` that begin with a 1 and continue with 1s, 2s or 3s.

    A run starts at an element equal to 1 and extends over every following
    element that is 1, 2 or 3. Only runs covering at least two elements are
    reported; an isolated 1 (followed by some other value, or sitting at the
    very end of the list) is ignored.

    Args:
        lst: Sequence of integer codes, one per line.

    Returns:
        A tuple ``(indices, ones_indices)`` of two parallel lists:
        ``indices`` holds an inclusive ``(start, end)`` pair for each run, and
        ``ones_indices`` holds, for each run, the positions inside it whose
        value is 1 (always beginning with ``start``).
    """
    indices = []
    ones_indices = []
    i = 0
    while i < len(lst):
        if lst[i] != 1:
            i += 1
            continue

        # A 1 may open a run; remember where it started.
        start = i
        ones_in_this_interval = [i]
        i += 1

        # Consume every following 1, 2 or 3, tracking where the 1s are.
        while i < len(lst) and lst[i] in (1, 2, 3):
            if lst[i] == 1:
                ones_in_this_interval.append(i)
            i += 1

        # Accept the run only if it spans two or more elements. This is the
        # same as "more than one 1, or the element after start is 2/3", but
        # never indexes past the end of the list when the opening 1 is the
        # last element.
        if i - start > 1:
            indices.append((start, i - 1))
            ones_indices.append(ones_in_this_interval)

    return indices, ones_indices
def
find_repeating_patterns
(
lst
):
indices
=
[]
ones_indices
=
[]
...
...
@@ -93,7 +113,7 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
else
:
layout_left
=
layout
[
0
]
if
l
[
'bbox'
][
0
]
==
layout_left
:
if
first_char
.
isupper
()
or
first_char
.
isdigit
():
if
first_char
.
isupper
()
or
first_char
.
isdigit
()
or
not
first_char
.
isalnum
()
:
line_fea_encode
.
append
(
1
)
else
:
line_fea_encode
.
append
(
4
)
...
...
@@ -105,7 +125,7 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice
,
list_start_idx
=
find_repeating_patterns
(
line_fea_encode
)
list_indice
,
list_start_idx
=
find_repeating_patterns
2
(
line_fea_encode
)
if
len
(
list_indice
)
>
0
:
logger
.
info
(
f
"发现了列表,列表行数:{list_indice}, {list_start_idx}"
)
...
...
@@ -241,17 +261,13 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
每个layout内的行进行聚合
"""
# 因为只是一个block一行目前, 一个block就是一个段落
lines_group
=
[]
blocks_group
=
[]
for
lyout
in
layout_bboxes
:
lines
=
[
line
for
block
in
blocks
if
block
[
"type"
]
==
BlockType
.
Text
and
is_in_layout
(
block
[
'bbox'
],
lyout
[
'layout_bbox'
])
for
line
in
block
[
'lines'
]]
#
lines = [line for block in blocks if block["type"] == BlockType.Text and is_in_layout(block['bbox'], lyout['layout_bbox']) for line in
#
block['lines']]
blocks_in_layout
=
[
block
for
block
in
blocks
if
is_in_layout
(
block
[
'bbox'
],
lyout
[
'layout_bbox'
])]
lines_group
.
append
(
lines
)
blocks_group
.
append
(
blocks_in_layout
)
return
lines_group
,
blocks_group
return
blocks_group
def
__split_para_in_layoutbox2
(
lines_group
,
new_layout_bbox
,
lang
=
"en"
,
char_avg_len
=
10
):
...
...
@@ -305,7 +321,12 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg
"""根据list_range,把lines分成几个部分
"""
for
list_start
in
list_start_line
:
if
len
(
list_start
)
>
1
:
for
i
in
range
(
1
,
len
(
list_start
)):
index
=
list_start
[
i
]
-
1
if
"content"
in
lines
[
index
][
"spans"
][
-
1
]:
lines
[
index
][
"spans"
][
-
1
][
"content"
]
+=
'
\n\n
'
# layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2]
# layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]
para
=
[]
# 元素是line
...
...
@@ -654,7 +675,7 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
3. 参照上述行尾特征进行分段。
4. 图、表,目前独占一行,不考虑分段。
"""
lines_group
,
blocks_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
blocks_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
layout_list_info
=
__split_para_in_layoutbox
(
blocks_group
,
new_layout_bbox
,
lang
)
# layout内分段
blocks_group
,
page_list_info
=
__connect_list_inter_layout
(
blocks_group
,
new_layout_bbox
,
layout_list_info
,
page_num
,
lang
)
# layout之间连接列表段落
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment