Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f0c463ed
Commit
f0c463ed
authored
Mar 26, 2024
by
许瑞
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of
https://github.com/myhloli/Magic-PDF
parents
efed5faa
3d2fcc9d
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
228 additions
and
83 deletions
+228
-83
ocr_demo.py
demo/ocr_demo.py
+5
-4
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+22
-6
para_split.py
magic_pdf/para/para_split.py
+163
-25
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+35
-46
pipeline.py
magic_pdf/pipeline.py
+3
-2
No files found.
demo/ocr_demo.py
View file @
f0c463ed
...
@@ -90,9 +90,10 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
...
@@ -90,9 +90,10 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
#
pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
#
json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path)
# ocr_local_parse(pdf_path, json_file_path)
# book_name = "数学新星网/edu_00001236"
book_name
=
"科数网/edu_00011318"
# ocr_online_parse(book_name)
ocr_online_parse
(
book_name
)
pass
pass
magic_pdf/dict2md/ocr_mkcontent.py
View file @
f0c463ed
...
@@ -72,18 +72,26 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
...
@@ -72,18 +72,26 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
markdown
=
[]
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
():
for
_
,
page_info
in
pdf_info_dict
.
items
():
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
)
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
,
"mm"
)
markdown
.
extend
(
page_markdown
)
markdown
.
extend
(
page_markdown
)
return
'
\n\n
'
.
join
(
markdown
)
return
'
\n\n
'
.
join
(
markdown
)
def
ocr_mk_nlp_markdown_with_para
(
pdf_info_dict
:
dict
):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
():
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
,
"nlp"
)
markdown
.
extend
(
page_markdown
)
return
'
\n\n
'
.
join
(
markdown
)
def
ocr_mk_mm_markdown_with_para_and_pagination
(
pdf_info_dict
:
dict
):
def
ocr_mk_mm_markdown_with_para_and_pagination
(
pdf_info_dict
:
dict
):
markdown_with_para_and_pagination
=
[]
markdown_with_para_and_pagination
=
[]
for
page_no
,
page_info
in
pdf_info_dict
.
items
():
for
page_no
,
page_info
in
pdf_info_dict
.
items
():
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
if
not
paras_of_layout
:
if
not
paras_of_layout
:
continue
continue
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
)
page_markdown
=
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
,
"mm"
)
markdown_with_para_and_pagination
.
append
({
markdown_with_para_and_pagination
.
append
({
'page_no'
:
page_no
,
'page_no'
:
page_no
,
'md_content'
:
'
\n\n
'
.
join
(
page_markdown
)
'md_content'
:
'
\n\n
'
.
join
(
page_markdown
)
...
@@ -91,7 +99,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
...
@@ -91,7 +99,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
return
markdown_with_para_and_pagination
return
markdown_with_para_and_pagination
def
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
):
def
ocr_mk_mm_markdown_with_para_core
(
paras_of_layout
,
mode
):
page_markdown
=
[]
page_markdown
=
[]
for
paras
in
paras_of_layout
:
for
paras
in
paras_of_layout
:
for
para
in
paras
:
for
para
in
paras
:
...
@@ -99,6 +107,7 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
...
@@ -99,6 +107,7 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
for
line
in
para
:
for
line
in
para
:
for
span
in
line
[
'spans'
]:
for
span
in
line
[
'spans'
]:
span_type
=
span
.
get
(
'type'
)
span_type
=
span
.
get
(
'type'
)
content
=
''
if
span_type
==
ContentType
.
Text
:
if
span_type
==
ContentType
.
Text
:
content
=
split_long_words
(
span
[
'content'
])
content
=
split_long_words
(
span
[
'content'
])
# content = span['content']
# content = span['content']
...
@@ -107,9 +116,16 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
...
@@ -107,9 +116,16 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
elif
span_type
==
ContentType
.
InterlineEquation
:
elif
span_type
==
ContentType
.
InterlineEquation
:
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
content
=
f
"
\n
})
\n
"
if
mode
==
'mm'
:
para_text
+=
content
+
' '
content
=
f
"
\n
})
\n
"
page_markdown
.
append
(
para_text
.
strip
()
+
' '
)
elif
mode
==
'nlp'
:
pass
if
content
!=
''
:
para_text
+=
content
+
' '
if
para_text
.
strip
()
==
''
:
continue
else
:
page_markdown
.
append
(
para_text
.
strip
()
+
' '
)
return
page_markdown
return
page_markdown
...
...
magic_pdf/para/para_split.py
View file @
f0c463ed
This diff is collapsed.
Click to expand it.
magic_pdf/pdf_parse_by_ocr.py
View file @
f0c463ed
...
@@ -57,16 +57,16 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
...
@@ -57,16 +57,16 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
def
parse_pdf_by_ocr
(
def
parse_pdf_by_ocr
(
pdf_path
,
pdf_path
,
s3_pdf_profile
,
s3_pdf_profile
,
pdf_model_output
,
pdf_model_output
,
save_path
,
save_path
,
book_name
,
book_name
,
pdf_model_profile
=
None
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
start_page_id
=
0
,
end_page_id
=
None
,
end_page_id
=
None
,
debug_mode
=
False
,
debug_mode
=
False
,
):
):
pdf_bytes
=
read_file
(
pdf_path
,
s3_pdf_profile
)
pdf_bytes
=
read_file
(
pdf_path
,
s3_pdf_profile
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
...
@@ -95,7 +95,6 @@ def parse_pdf_by_ocr(
...
@@ -95,7 +95,6 @@ def parse_pdf_by_ocr(
start_time
=
time
.
time
()
start_time
=
time
.
time
()
end_page_id
=
end_page_id
if
end_page_id
else
len
(
pdf_docs
)
-
1
end_page_id
=
end_page_id
if
end_page_id
else
len
(
pdf_docs
)
-
1
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
...
@@ -125,13 +124,6 @@ def parse_pdf_by_ocr(
...
@@ -125,13 +124,6 @@ def parse_pdf_by_ocr(
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
debug_mode
=
debug_mode
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
debug_mode
=
debug_mode
)
)
# 构建需要remove的bbox列表
# need_remove_spans_bboxes = []
# need_remove_spans_bboxes.extend(page_no_bboxes)
# need_remove_spans_bboxes.extend(header_bboxes)
# need_remove_spans_bboxes.extend(footer_bboxes)
# need_remove_spans_bboxes.extend(footnote_bboxes)
# 构建需要remove的bbox字典
# 构建需要remove的bbox字典
need_remove_spans_bboxes_dict
=
{
need_remove_spans_bboxes_dict
=
{
DropTag
.
PAGE_NUMBER
:
page_no_bboxes
,
DropTag
.
PAGE_NUMBER
:
page_no_bboxes
,
...
@@ -199,50 +191,48 @@ def parse_pdf_by_ocr(
...
@@ -199,50 +191,48 @@ def parse_pdf_by_ocr(
else
:
else
:
continue
continue
'''删除重叠spans中较小的那些'''
# 删除重叠spans中较小的那些
spans
,
dropped_spans_by_span_overlap
=
remove_overlaps_min_spans
(
spans
)
spans
,
dropped_spans_by_span_overlap
=
remove_overlaps_min_spans
(
spans
)
# 删除remove_span_block_bboxes中的bbox
'''
# spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
删除remove_span_block_bboxes中的bbox
# 按qa要求,增加drop相关数据
并增加drop相关数据
'''
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
# 对image和table截图
'''对image和table截图'''
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list
=
[]
displayed_list
=
[]
text_inline_lines
=
[]
text_inline_lines
=
[]
modify_y_axis
(
spans
,
displayed_list
,
text_inline_lines
)
modify_y_axis
(
spans
,
displayed_list
,
text_inline_lines
)
# 模型识别错误的行间公式, type类型转换成行内公式
'''模型识别错误的行间公式, type类型转换成行内公式'''
spans
=
modify_inline_equation
(
spans
,
displayed_list
,
text_inline_lines
)
spans
=
modify_inline_equation
(
spans
,
displayed_list
,
text_inline_lines
)
# bbox去除粘连
'''bbox去除粘连'''
spans
=
remove_overlap_between_bbox
(
spans
)
spans
=
remove_overlap_between_bbox
(
spans
)
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
'''
对tpye=["interline_equation", "image", "table"]进行额外处理,
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
'''
spans
=
adjust_bbox_for_standalone_block
(
spans
)
spans
=
adjust_bbox_for_standalone_block
(
spans
)
'''从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)'''
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes
,
layout_tree
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
layout_bboxes
,
layout_tree
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
# 将spans合并成line(在layout内,从上到下,从左到右)
'''将spans合并成line(在layout内,从上到下,从左到右)'''
lines
,
dropped_spans_by_layout
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
lines
,
dropped_spans_by_layout
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
# 将lines合并成block
'''将lines合并成block'''
blocks
=
merge_lines_to_block
(
lines
)
blocks
=
merge_lines_to_block
(
lines
)
# 根据block合并段落
'''获取QA需要外置的list'''
#para_blocks = para_split(blocks, layout_bboxes)
# 获取QA需要外置的list
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
# drop的span_list合并
'''drop的span_list合并'''
dropped_spans
=
[]
dropped_spans
=
[]
dropped_spans
.
extend
(
dropped_spans_by_span_overlap
)
dropped_spans
.
extend
(
dropped_spans_by_span_overlap
)
dropped_spans
.
extend
(
dropped_spans_by_removed_bboxes
)
dropped_spans
.
extend
(
dropped_spans_by_removed_bboxes
)
...
@@ -263,19 +253,18 @@ def parse_pdf_by_ocr(
...
@@ -263,19 +253,18 @@ def parse_pdf_by_ocr(
elif
span
[
'type'
]
in
[
ContentType
.
InlineEquation
,
ContentType
.
InterlineEquation
]:
elif
span
[
'type'
]
in
[
ContentType
.
InlineEquation
,
ContentType
.
InterlineEquation
]:
dropped_equation_block
.
append
(
span
)
dropped_equation_block
.
append
(
span
)
'''构造pdf_info_dict'''
# 构造pdf_info_dict
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
need_remove_spans_bboxes_dict
)
need_remove_spans_bboxes_dict
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
"""分段"""
"""分段"""
para_split
(
pdf_info_dict
)
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
# 在测试时,保存调试信息
'''在测试时,保存调试信息'''
if
debug_mode
:
if
debug_mode
:
params_file_save_path
=
join_path
(
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
...
...
magic_pdf/pipeline.py
View file @
f0c463ed
...
@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
...
@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_nlp_markdown
,
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
,
ocr_mk_mm_standard_format
,
ocr_mk_mm_markdown_with_para
,
ocr_mk_mm_markdown_with_para_and_pagination
,
ocr_mk_mm_markdown_with_para
,
ocr_mk_mm_markdown_with_para_and_pagination
,
ocr_mk_nlp_markdown_with_para
,
)
)
from
magic_pdf.libs.commons
import
(
from
magic_pdf.libs.commons
import
(
read_file
,
read_file
,
...
@@ -510,7 +510,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
...
@@ -510,7 +510,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
pdf_intermediate_dict
=
jso
[
"pdf_intermediate_dict"
]
pdf_intermediate_dict
=
jso
[
"pdf_intermediate_dict"
]
# 将 pdf_intermediate_dict 解压
# 将 pdf_intermediate_dict 解压
pdf_intermediate_dict
=
JsonCompressor
.
decompress_json
(
pdf_intermediate_dict
)
pdf_intermediate_dict
=
JsonCompressor
.
decompress_json
(
pdf_intermediate_dict
)
markdown_content
=
ocr_mk_mm_markdown_with_para
(
pdf_intermediate_dict
)
# markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
markdown_content
=
ocr_mk_nlp_markdown_with_para
(
pdf_intermediate_dict
)
jso
[
"content"
]
=
markdown_content
jso
[
"content"
]
=
markdown_content
logger
.
info
(
logger
.
info
(
f
"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
,
f
"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment