Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
b2019af5
Commit
b2019af5
authored
May 06, 2024
by
liukaiwen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
解决标题后空格丢失
parent
69d835c9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
24 additions
and
3 deletions
+24
-3
Constants.py
magic_pdf/libs/Constants.py
+11
-0
para_split_v2.py
magic_pdf/para/para_split_v2.py
+13
-3
No files found.
magic_pdf/libs/Constants.py
0 → 100644
View file @
b2019af5
"""Custom field names used as dictionary keys on spans and blocks."""

# --- Span-level custom fields ---

# Whether the span was merged across a page boundary.
CROSS_PAGE = "cross_page"

# --- Block-level custom fields ---

# Whether the lines of the block have been deleted.
LINES_DELETED = "lines_deleted"
\ No newline at end of file
magic_pdf/para/para_split_v2.py
View file @
b2019af5
...
@@ -5,6 +5,7 @@ from loguru import logger
...
@@ -5,6 +5,7 @@ from loguru import logger
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap_with_area_ratio
as
is_in_layout
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap_with_area_ratio
as
is_in_layout
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.libs.Constants
import
*
LINE_STOP_FLAG
=
[
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
":"
,
":"
,
")"
,
")"
,
";"
]
LINE_STOP_FLAG
=
[
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
":"
,
":"
,
")"
,
")"
,
";"
]
INLINE_EQUATION
=
ContentType
.
InlineEquation
INLINE_EQUATION
=
ContentType
.
InlineEquation
...
@@ -449,6 +450,10 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
...
@@ -449,6 +450,10 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
if
len
(
may_list_lines
)
>
0
and
len
(
set
([
x
[
'bbox'
][
0
]
for
x
in
may_list_lines
]))
==
1
:
if
len
(
may_list_lines
)
>
0
and
len
(
set
([
x
[
'bbox'
][
0
]
for
x
in
may_list_lines
]))
==
1
:
#pre_page_paras[-1].append(may_list_lines)
#pre_page_paras[-1].append(may_list_lines)
# 下一页合并到上一页最后一段,打一个cross_page的标签
for
line
in
may_list_lines
:
for
span
in
line
:
span
[
CROSS_PAGE
]
=
True
pre_page_paras
[
-
1
][
-
1
][
"lines"
]
.
extend
(
may_list_lines
)
pre_page_paras
[
-
1
][
-
1
][
"lines"
]
.
extend
(
may_list_lines
)
next_page_paras
[
0
]
=
next_page_paras
[
0
][
len
(
may_list_lines
):]
next_page_paras
[
0
]
=
next_page_paras
[
0
][
len
(
may_list_lines
):]
return
True
return
True
...
@@ -518,7 +523,7 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang):
...
@@ -518,7 +523,7 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang):
connected_layout_blocks
[
-
1
][
-
1
][
"lines"
]
.
extend
(
blocks_group
[
i
][
0
][
"lines"
])
connected_layout_blocks
[
-
1
][
-
1
][
"lines"
]
.
extend
(
blocks_group
[
i
][
0
][
"lines"
])
#layout_paras[i].pop(0) # 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。
#layout_paras[i].pop(0) # 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。
blocks_group
[
i
][
0
][
"lines"
]
=
[]
#删除后一个layout第一个段落中的lines,因为他已经被合并到前一个layout的最后一个段落了
blocks_group
[
i
][
0
][
"lines"
]
=
[]
#删除后一个layout第一个段落中的lines,因为他已经被合并到前一个layout的最后一个段落了
blocks_group
[
i
][
0
][
"lines_deleted"
]
=
True
blocks_group
[
i
][
0
][
LINES_DELETED
]
=
True
# if len(layout_paras[i]) == 0:
# if len(layout_paras[i]) == 0:
# layout_paras.pop(i)
# layout_paras.pop(i)
# else:
# else:
...
@@ -571,10 +576,15 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
...
@@ -571,10 +576,15 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
if
pre_last_line
[
'bbox'
][
2
]
==
pre_x2_max
and
pre_last_line_text
[
-
1
]
not
in
LINE_STOP_FLAG
and
\
if
pre_last_line
[
'bbox'
][
2
]
==
pre_x2_max
and
pre_last_line_text
[
-
1
]
not
in
LINE_STOP_FLAG
and
\
next_first_line
[
'bbox'
][
0
]
==
next_x0_min
:
# 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
next_first_line
[
'bbox'
][
0
]
==
next_x0_min
:
# 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
"""连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
"""连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
# 下一页合并到上一页最后一段,打一个cross_page的标签
for
line
in
next_first_para
:
for
span
in
line
:
span
[
CROSS_PAGE
]
=
True
pre_last_para
.
extend
(
next_first_para
)
pre_last_para
.
extend
(
next_first_para
)
#next_page_paras[0].pop(0) # 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。
#next_page_paras[0].pop(0) # 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。
next_page_paras
[
0
][
0
][
"lines"
]
=
[]
next_page_paras
[
0
][
0
][
"lines"
]
=
[]
next_page_paras
[
0
][
0
][
"lines_deleted"
]
=
True
next_page_paras
[
0
][
0
][
LINES_DELETED
]
=
True
return
True
return
True
else
:
else
:
return
False
return
False
...
@@ -647,7 +657,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
...
@@ -647,7 +657,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
layout_para
[
start
][
"lines"
]
=
merge_para
layout_para
[
start
][
"lines"
]
=
merge_para
for
i_para
in
range
(
start
+
1
,
end
+
1
):
for
i_para
in
range
(
start
+
1
,
end
+
1
):
layout_para
[
i_para
][
"lines"
]
=
[]
layout_para
[
i_para
][
"lines"
]
=
[]
layout_para
[
i_para
][
"lines_deleted"
]
=
True
layout_para
[
i_para
][
LINES_DELETED
]
=
True
#layout_para[start:end + 1] = [merge_para]
#layout_para[start:end + 1] = [merge_para]
#index_offset -= end - start
#index_offset -= end - start
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment