Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f0c463ed
Commit
f0c463ed
authored
Mar 26, 2024
by
许瑞
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of
https://github.com/myhloli/Magic-PDF
parents
efed5faa
3d2fcc9d
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
228 additions
and
83 deletions
+228
-83
ocr_demo.py
demo/ocr_demo.py
+5
-4
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+22
-6
para_split.py
magic_pdf/para/para_split.py
+163
-25
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+35
-46
pipeline.py
magic_pdf/pipeline.py
+3
-2
No files found.
demo/ocr_demo.py
View file @
f0c463ed
...
...
@@ -90,9 +90,10 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
if
__name__
==
'__main__'
:
#
pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
#
json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path)
# book_name = "数学新星网/edu_00001236"
# ocr_online_parse(book_name)
book_name
=
"科数网/edu_00011318"
ocr_online_parse
(
book_name
)
pass
magic_pdf/dict2md/ocr_mkcontent.py
View file @
f0c463ed
...
...
@@ -72,18 +72,26 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
():
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
)
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
,
"mm"
)
markdown
.
extend
(
page_markdown
)
return
'
\n\n
'
.
join
(
markdown
)
def
ocr_mk_nlp_markdown_with_para
(
pdf_info_dict
:
dict
):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
():
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
,
"nlp"
)
markdown
.
extend
(
page_markdown
)
return
'
\n\n
'
.
join
(
markdown
)
def
ocr_mk_mm_markdown_with_para_and_pagination
(
pdf_info_dict
:
dict
):
markdown_with_para_and_pagination
=
[]
for
page_no
,
page_info
in
pdf_info_dict
.
items
():
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
if
not
paras_of_layout
:
continue
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
)
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
,
"mm"
)
markdown_with_para_and_pagination
.
append
({
'page_no'
:
page_no
,
'md_content'
:
'
\n\n
'
.
join
(
page_markdown
)
...
...
@@ -91,7 +99,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
return
markdown_with_para_and_pagination
def
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
):
def
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
,
mode
):
page_markdown
=
[]
for
paras
in
paras_of_layout
:
for
para
in
paras
:
...
...
@@ -99,6 +107,7 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
for
line
in
para
:
for
span
in
line
[
'spans'
]:
span_type
=
span
.
get
(
'type'
)
content
=
''
if
span_type
==
ContentType
.
Text
:
content
=
split_long_words
(
span
[
'content'
])
# content = span['content']
...
...
@@ -107,9 +116,16 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
elif
span_type
==
ContentType
.
InterlineEquation
:
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
content
=
f
"
\n
})
\n
"
para_text
+=
content
+
' '
page_markdown
.
append
(
para_text
.
strip
()
+
' '
)
if
mode
==
'mm'
:
content
=
f
"
\n
})
\n
"
elif
mode
==
'nlp'
:
pass
if
content
!=
''
:
para_text
+=
content
+
' '
if
para_text
.
strip
()
==
''
:
continue
else
:
page_markdown
.
append
(
para_text
.
strip
()
+
' '
)
return
page_markdown
...
...
magic_pdf/para/para_split.py
View file @
f0c463ed
...
...
@@ -9,7 +9,7 @@ from magic_pdf.libs.ocr_content_type import ContentType
LINE_STOP_FLAG
=
[
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
":"
,
":"
,
")"
,
")"
,
";"
]
INLINE_EQUATION
=
ContentType
.
InlineEquation
INTERLINE_EQUATION
=
ContentType
.
InterlineEquation
TEXT
=
"text"
TEXT
=
ContentType
.
Text
def
__get_span_text
(
span
):
...
...
@@ -20,7 +20,7 @@ def __get_span_text(span):
return
c
def
__detect_list_lines
(
lines
,
new_layout_bboxes
,
lang
=
'en'
):
def
__detect_list_lines
(
lines
,
new_layout_bboxes
,
lang
):
"""
探测是否包含了列表,并且把列表的行分开.
这样的段落特点是,顶格字母大写/数字,紧跟着几行缩进的。缩进的行首字母含小写的。
...
...
@@ -315,11 +315,14 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
return
layout_paras
,
list_info
def
__connect_list_inter_layout
(
layout_paras
,
new_layout_bbox
,
layout_list_info
,
page_num
,
lang
=
"en"
):
def
__connect_list_inter_layout
(
layout_paras
,
new_layout_bbox
,
layout_list_info
,
page_num
,
lang
):
"""
如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。
如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。
TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。
"""
if
len
(
layout_paras
)
==
0
or
len
(
layout_list_info
)
==
0
:
# 0的时候最后的return 会出错
return
layout_paras
,
[
False
,
False
]
for
i
in
range
(
1
,
len
(
layout_paras
)):
pre_layout_list_info
=
layout_list_info
[
i
-
1
]
next_layout_list_info
=
layout_list_info
[
i
]
...
...
@@ -345,7 +348,37 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info,
pre_last_para
.
extend
(
may_list_lines
)
layout_paras
[
i
]
=
layout_paras
[
i
][
len
(
may_list_lines
):]
return
layout_paras
return
layout_paras
,
[
layout_list_info
[
0
][
0
],
layout_list_info
[
-
1
][
1
]]
# 同时还返回了这个页面级别的开头、结尾是不是列表的信息
def
__connect_list_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
pre_page_list_info
,
next_page_list_info
,
page_num
,
lang
):
"""
如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。
"""
if
len
(
pre_page_paras
)
==
0
or
len
(
next_page_paras
)
==
0
:
# 0的时候最后的return 会出错
return
False
if
pre_page_list_info
[
1
]
and
not
next_page_list_info
[
0
]:
# 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
logger
.
info
(
f
"连接page {page_num} 内的list"
)
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines
=
[]
for
j
in
range
(
len
(
next_page_paras
[
0
])):
line
=
next_page_paras
[
0
][
j
]
if
len
(
line
)
==
1
:
# 只可能是一行,多行情况再需要分析了
if
line
[
0
][
'bbox'
][
0
]
>
__find_layout_bbox_by_line
(
line
[
0
][
'bbox'
],
next_page_layout_bbox
)[
0
]:
may_list_lines
.
append
(
line
[
0
])
else
:
break
else
:
break
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
if
len
(
may_list_lines
)
>
0
and
len
(
set
([
x
[
'bbox'
][
0
]
for
x
in
may_list_lines
]))
==
1
:
pre_page_paras
[
-
1
]
.
append
(
may_list_lines
)
next_page_paras
[
0
]
=
next_page_paras
[
0
][
len
(
may_list_lines
):]
return
True
return
False
def
__find_layout_bbox_by_line
(
line_bbox
,
layout_bboxes
):
...
...
@@ -358,7 +391,7 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
return
None
def
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
lang
=
"en"
):
def
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
lang
):
"""
layout之间进行分段。
主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
...
...
@@ -368,10 +401,19 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang="en"):
"""
connected_layout_paras
=
[]
if
len
(
layout_paras
)
==
0
:
return
connected_layout_paras
connected_layout_paras
.
append
(
layout_paras
[
0
])
for
i
in
range
(
1
,
len
(
layout_paras
)):
pre_last_line
=
layout_paras
[
i
-
1
][
-
1
][
-
1
]
next_first_line
=
layout_paras
[
i
][
0
][
0
]
try
:
if
len
(
layout_paras
[
i
])
==
0
or
len
(
layout_paras
[
i
-
1
])
==
0
:
# TODO 考虑连接问题,
continue
pre_last_line
=
layout_paras
[
i
-
1
][
-
1
][
-
1
]
next_first_line
=
layout_paras
[
i
][
0
][
0
]
except
Exception
as
e
:
logger
.
error
(
f
"page layout {i} has no line"
)
continue
pre_last_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
pre_last_line
[
'spans'
]])
pre_last_line_type
=
pre_last_line
[
'spans'
][
-
1
][
'type'
]
next_first_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
next_first_line
[
'spans'
]])
...
...
@@ -400,7 +442,7 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang="en"):
return
connected_layout_paras
def
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
lang
):
def
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
page_num
,
lang
):
"""
连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。
是否可以连接的条件:
...
...
@@ -408,7 +450,7 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
2. 后一个页面的第一个段落第一行没有空白开头。
"""
# 有的页面可能压根没有文字
if
len
(
pre_page_paras
)
==
0
or
len
(
next_page_paras
)
==
0
:
if
len
(
pre_page_paras
)
==
0
or
len
(
next_page_paras
)
==
0
or
len
(
pre_page_paras
[
0
])
==
0
or
len
(
next_page_paras
[
0
])
==
0
:
# TODO [[]]为什么出现在pre_page_paras里?
return
False
pre_last_para
=
pre_page_paras
[
-
1
][
-
1
]
next_first_para
=
next_page_paras
[
0
][
0
]
...
...
@@ -436,8 +478,85 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
else
:
return
False
def
find_consecutive_true_regions
(
input_array
):
start_index
=
None
# 连续True区域的起始索引
regions
=
[]
# 用于保存所有连续True区域的起始和结束索引
for
i
in
range
(
len
(
input_array
)):
# 如果我们找到了一个True值,并且当前并没有在连续True区域中
if
input_array
[
i
]
and
start_index
is
None
:
start_index
=
i
# 记录连续True区域的起始索引
def
__do_split
(
blocks
,
layout_bboxes
,
new_layout_bbox
,
page_num
,
lang
=
"en"
):
# 如果我们找到了一个False值,并且当前在连续True区域中
elif
not
input_array
[
i
]
and
start_index
is
not
None
:
# 如果连续True区域长度大于1,那么将其添加到结果列表中
if
i
-
start_index
>
1
:
regions
.
append
((
start_index
,
i
-
1
))
start_index
=
None
# 重置起始索引
# 如果最后一个元素是True,那么需要将最后一个连续True区域加入到结果列表中
if
start_index
is
not
None
and
len
(
input_array
)
-
start_index
>
1
:
regions
.
append
((
start_index
,
len
(
input_array
)
-
1
))
return
regions
def
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
,
debug_mode
):
"""
找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。
一个line居中的条件是:
1. 水平中心点跨越layout的中心点。
2. 左右两侧都有空白
"""
for
layout_i
,
layout_para
in
enumerate
(
page_paras
):
layout_box
=
new_layout_bbox
[
layout_i
]
single_line_paras_tag
=
[]
for
i
in
range
(
len
(
layout_para
)):
single_line_paras_tag
.
append
(
len
(
layout_para
[
i
])
==
1
and
layout_para
[
i
][
0
][
'spans'
][
0
][
'type'
]
==
TEXT
)
"""找出来连续的单行文本,如果连续行高度相同,那么合并为一个段落。"""
consecutive_single_line_indices
=
find_consecutive_true_regions
(
single_line_paras_tag
)
if
len
(
consecutive_single_line_indices
)
>
0
:
index_offset
=
0
"""检查这些行是否是高度相同的,居中的"""
for
start
,
end
in
consecutive_single_line_indices
:
start
+=
index_offset
end
+=
index_offset
line_hi
=
np
.
array
([
line
[
0
][
'bbox'
][
3
]
-
line
[
0
][
'bbox'
][
1
]
for
line
in
layout_para
[
start
:
end
+
1
]])
first_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
layout_para
[
start
][
0
][
'spans'
]])
if
"Table"
in
first_line_text
or
"Figure"
in
first_line_text
:
pass
if
debug_mode
:
logger
.
info
(
line_hi
.
std
())
if
line_hi
.
std
()
<
2
:
"""行高度相同,那么判断是否居中"""
all_left_x0
=
[
line
[
0
][
'bbox'
][
0
]
for
line
in
layout_para
[
start
:
end
+
1
]]
all_right_x1
=
[
line
[
0
][
'bbox'
][
2
]
for
line
in
layout_para
[
start
:
end
+
1
]]
layout_center
=
(
layout_box
[
0
]
+
layout_box
[
2
])
/
2
if
all
([
x0
<
layout_center
<
x1
for
x0
,
x1
in
zip
(
all_left_x0
,
all_right_x1
)])
\
and
not
all
([
x0
==
layout_box
[
0
]
for
x0
in
all_left_x0
])
\
and
not
all
([
x1
==
layout_box
[
2
]
for
x1
in
all_right_x1
]):
merge_para
=
[
l
[
0
]
for
l
in
layout_para
[
start
:
end
+
1
]]
para_text
=
''
.
join
([
__get_span_text
(
span
)
for
line
in
merge_para
for
span
in
line
[
'spans'
]])
if
debug_mode
:
logger
.
info
(
para_text
)
layout_para
[
start
:
end
+
1
]
=
[
merge_para
]
index_offset
-=
end
-
start
return
def
__merge_signle_list_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
):
"""
找出来连续的单行文本,如果首行顶格,接下来的几个单行段落缩进对齐,那么合并为一个段落。
"""
pass
def
__do_split_page
(
blocks
,
layout_bboxes
,
new_layout_bbox
,
page_num
,
lang
):
"""
根据line和layout情况进行分段
先实现一个根据行末尾特征分段的简单方法。
...
...
@@ -451,35 +570,54 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, page_num, lang="en"):
"""
lines_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
layout_paras
,
layout_list_info
=
__split_para_in_layoutbox
(
lines_group
,
new_layout_bbox
,
lang
)
# layout内分段
layout_paras2
=
__connect_list_inter_layout
(
layout_paras
,
new_layout_bbox
,
layout_list_info
,
page_num
,
lang
)
# layout之间连接列表段落
layout_paras2
,
page_list_info
=
__connect_list_inter_layout
(
layout_paras
,
new_layout_bbox
,
layout_list_info
,
page_num
,
lang
)
# layout之间连接列表段落
connected_layout_paras
=
__connect_para_inter_layoutbox
(
layout_paras2
,
new_layout_bbox
,
lang
)
# layout间链接段落
return
connected_layout_paras
def
para_split
(
pdf_info_dict
,
lang
=
"en"
):
return
connected_layout_paras
,
page_list_info
def
para_split
(
pdf_info_dict
,
debug_mode
,
lang
=
"en"
):
"""
根据line和layout情况进行分段
"""
new_layout_of_pages
=
[]
# 数组的数组,每个元素是一个页面的layoutS
all_page_list_info
=
[]
# 保存每个页面开头和结尾是否是列表
for
page_num
,
page
in
pdf_info_dict
.
items
():
blocks
=
page
[
'preproc_blocks'
]
layout_bboxes
=
page
[
'layout_bboxes'
]
new_layout_bbox
=
__common_pre_proc
(
blocks
,
layout_bboxes
)
new_layout_of_pages
.
append
(
new_layout_bbox
)
splited_blocks
=
__do_split
(
blocks
,
layout_bboxes
,
new_layout_bbox
,
page_num
,
lang
)
splited_blocks
,
page_list_info
=
__do_split_page
(
blocks
,
layout_bboxes
,
new_layout_bbox
,
page_num
,
lang
)
all_page_list_info
.
append
(
page_list_info
)
page
[
'para_blocks'
]
=
splited_blocks
"""连接页面与页面之间的可能合并的段落"""
pdf_infos
=
list
(
pdf_info_dict
.
values
())
for
i
,
page
in
enumerate
(
pdf_info_dict
.
values
()):
if
i
==
0
:
for
page_num
,
page
in
enumerate
(
pdf_info_dict
.
values
()):
if
page_num
==
0
:
continue
pre_page_paras
=
pdf_infos
[
i
-
1
][
'para_blocks'
]
next_page_paras
=
pdf_infos
[
i
][
'para_blocks'
]
pre_page_layout_bbox
=
new_layout_of_pages
[
i
-
1
]
next_page_layout_bbox
=
new_layout_of_pages
[
i
]
pre_page_paras
=
pdf_infos
[
page_num
-
1
][
'para_blocks'
]
next_page_paras
=
pdf_infos
[
page_num
][
'para_blocks'
]
pre_page_layout_bbox
=
new_layout_of_pages
[
page_num
-
1
]
next_page_layout_bbox
=
new_layout_of_pages
[
page_num
]
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
lang
)
if
is_conn
:
logger
.
info
(
f
"连接了第{i-1}页和第{i}页的段落"
)
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
page_num
,
lang
)
if
debug_mode
:
if
is_conn
:
logger
.
info
(
f
"连接了第{page_num-1}页和第{page_num}页的段落"
)
is_list_conn
=
__connect_list_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
all_page_list_info
[
page_num
-
1
],
all_page_list_info
[
page_num
],
page_num
,
lang
)
if
debug_mode
:
if
is_list_conn
:
logger
.
info
(
f
"连接了第{page_num-1}页和第{page_num}页的列表段落"
)
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
2. 居中的一些连续单行,如果高度相同,那么可能是一个段落。
"""
for
page_num
,
page
in
enumerate
(
pdf_info_dict
.
values
()):
page_paras
=
page
[
'para_blocks'
]
new_layout_bbox
=
new_layout_of_pages
[
page_num
]
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
,
debug_mode
=
debug_mode
)
__merge_signle_list_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
)
magic_pdf/pdf_parse_by_ocr.py
View file @
f0c463ed
...
...
@@ -57,16 +57,16 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
def
parse_pdf_by_ocr
(
pdf_path
,
s3_pdf_profile
,
pdf_model_output
,
save_path
,
book_name
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
end_page_id
=
None
,
debug_mode
=
False
,
pdf_path
,
s3_pdf_profile
,
pdf_model_output
,
save_path
,
book_name
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
end_page_id
=
None
,
debug_mode
=
False
,
):
pdf_bytes
=
read_file
(
pdf_path
,
s3_pdf_profile
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
...
...
@@ -95,7 +95,6 @@ def parse_pdf_by_ocr(
start_time
=
time
.
time
()
end_page_id
=
end_page_id
if
end_page_id
else
len
(
pdf_docs
)
-
1
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
...
...
@@ -125,13 +124,6 @@ def parse_pdf_by_ocr(
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
debug_mode
=
debug_mode
)
# 构建需要remove的bbox列表
# need_remove_spans_bboxes = []
# need_remove_spans_bboxes.extend(page_no_bboxes)
# need_remove_spans_bboxes.extend(header_bboxes)
# need_remove_spans_bboxes.extend(footer_bboxes)
# need_remove_spans_bboxes.extend(footnote_bboxes)
# 构建需要remove的bbox字典
need_remove_spans_bboxes_dict
=
{
DropTag
.
PAGE_NUMBER
:
page_no_bboxes
,
...
...
@@ -199,50 +191,48 @@ def parse_pdf_by_ocr(
else
:
continue
# 删除重叠spans中较小的那些
'''删除重叠spans中较小的那些'''
spans
,
dropped_spans_by_span_overlap
=
remove_overlaps_min_spans
(
spans
)
# 删除remove_span_block_bboxes中的bbox
# spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
# 按qa要求,增加drop相关数据
'''
删除remove_span_block_bboxes中的bbox
并增加drop相关数据
'''
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
# 对image和table截图
'''对image和table截图'''
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list
=
[]
text_inline_lines
=
[]
modify_y_axis
(
spans
,
displayed_list
,
text_inline_lines
)
# 模型识别错误的行间公式, type类型转换成行内公式
'''模型识别错误的行间公式, type类型转换成行内公式'''
spans
=
modify_inline_equation
(
spans
,
displayed_list
,
text_inline_lines
)
# bbox去除粘连
'''bbox去除粘连'''
spans
=
remove_overlap_between_bbox
(
spans
)
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
'''
对tpye=["interline_equation", "image", "table"]进行额外处理,
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
'''
spans
=
adjust_bbox_for_standalone_block
(
spans
)
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
'''从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)'''
layout_bboxes
,
layout_tree
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
# 将spans合并成line(在layout内,从上到下,从左到右)
'''将spans合并成line(在layout内,从上到下,从左到右)'''
lines
,
dropped_spans_by_layout
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
# 将lines合并成block
'''将lines合并成block'''
blocks
=
merge_lines_to_block
(
lines
)
# 根据block合并段落
#para_blocks = para_split(blocks, layout_bboxes)
# 获取QA需要外置的list
'''获取QA需要外置的list'''
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
# drop的span_list合并
'''drop的span_list合并'''
dropped_spans
=
[]
dropped_spans
.
extend
(
dropped_spans_by_span_overlap
)
dropped_spans
.
extend
(
dropped_spans_by_removed_bboxes
)
...
...
@@ -263,19 +253,18 @@ def parse_pdf_by_ocr(
elif
span
[
'type'
]
in
[
ContentType
.
InlineEquation
,
ContentType
.
InterlineEquation
]:
dropped_equation_block
.
append
(
span
)
# 构造pdf_info_dict
'''构造pdf_info_dict'''
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
need_remove_spans_bboxes_dict
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
"""分段"""
para_split
(
pdf_info_dict
)
# 在测试时,保存调试信息
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
'''在测试时,保存调试信息'''
if
debug_mode
:
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
...
...
magic_pdf/pipeline.py
View file @
f0c463ed
...
...
@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
,
ocr_mk_mm_markdown_with_para
,
ocr_mk_mm_markdown_with_para_and_pagination
,
ocr_mk_mm_markdown_with_para
,
ocr_mk_mm_markdown_with_para_and_pagination
,
ocr_mk_nlp_markdown_with_para
,
)
from
magic_pdf.libs.commons
import
(
read_file
,
...
...
@@ -510,7 +510,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
pdf_intermediate_dict
=
jso
[
"pdf_intermediate_dict"
]
# 将 pdf_intermediate_dict 解压
pdf_intermediate_dict
=
JsonCompressor
.
decompress_json
(
pdf_intermediate_dict
)
markdown_content
=
ocr_mk_mm_markdown_with_para
(
pdf_intermediate_dict
)
# markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
markdown_content
=
ocr_mk_nlp_markdown_with_para
(
pdf_intermediate_dict
)
jso
[
"content"
]
=
markdown_content
logger
.
info
(
f
"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment