Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
8e3beebd
Commit
8e3beebd
authored
Mar 21, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复index越界错误
parent
439c18f9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
16 additions
and
8 deletions
+16
-8
ocr_demo.py
demo/ocr_demo.py
+2
-2
para_split.py
magic_pdf/para/para_split.py
+14
-6
No files found.
demo/ocr_demo.py
View file @
8e3beebd
...
...
@@ -92,5 +92,5 @@ if __name__ == '__main__':
ocr_json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
ocr_local_parse
(
ocr_pdf_path
,
ocr_json_file_path
)
#ocr_online_parse(book_name="数学新星网/edu_00001236")
#
ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
ocr_online_parse
(
book_name
=
"美国加州中学教材/edu_00000060"
)
magic_pdf/para/para_split.py
View file @
8e3beebd
...
...
@@ -267,6 +267,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
且下一行开头不留空白。
"""
line_group_end_with_list
=
[]
# 这个layout最后是不是列表,用于跨layout列表合并
paras
=
[]
right_tail_distance
=
1.5
*
char_avg_len
for
lines
in
lines_group
:
...
...
@@ -291,7 +292,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_right
=
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
new_layout_bbox
)[
2
]
layout_left
=
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
new_layout_bbox
)[
0
]
para
=
[]
# 元素是line
is_lines_end_with_list
=
False
for
content_type
,
start
,
end
in
text_segments
:
if
content_type
==
'list'
:
for
i
,
line
in
enumerate
(
lines
[
start
:
end
+
1
]):
...
...
@@ -306,7 +307,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
para
=
[]
is_lines_end_with_list
=
True
else
:
for
i
,
line
in
enumerate
(
lines
[
start
:
end
+
1
]):
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断一下行结尾特征。
...
...
@@ -334,8 +335,12 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
para
=
[]
is_lines_end_with_list
=
False
line_group_end_with_list
.
append
(
is_lines_end_with_list
)
return
paras
return
paras
,
line_group_end_with_list
def
__find_layout_bbox_by_line
(
line_bbox
,
layout_bboxes
):
...
...
@@ -348,7 +353,7 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
return
None
def
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
lang
=
"en"
):
def
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
line_group_end_with_list
,
lang
=
"en"
):
"""
layout之间进行分段。
主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
...
...
@@ -395,6 +400,9 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
1. 前一个页面的最后一个段落最后一行沾满整个行。并且没有结尾符号。
2. 后一个页面的第一个段落第一行没有空白开头。
"""
# 有的页面可能压根没有文字
if
len
(
pre_page_paras
)
==
0
or
len
(
next_page_paras
)
==
0
:
return
False
pre_last_para
=
pre_page_paras
[
-
1
]
next_first_para
=
next_page_paras
[
0
]
pre_last_line
=
pre_last_para
[
-
1
]
...
...
@@ -435,8 +443,8 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, lang="en"):
4. 图、表,目前独占一行,不考虑分段。
"""
lines_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
layout_paras
=
__split_para_in_layoutbox
(
lines_group
,
new_layout_bbox
,
lang
)
# layout内分段
connected_layout_paras
=
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
lang
)
# layout间链接段落
layout_paras
,
line_group_end_with_list
=
__split_para_in_layoutbox
(
lines_group
,
new_layout_bbox
,
lang
)
# layout内分段
connected_layout_paras
=
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
line_group_end_with_list
,
lang
)
# layout间链接段落
return
connected_layout_paras
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment