Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
2acd1ecc
Commit
2acd1ecc
authored
Mar 20, 2024
by
liusilu
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of
https://github.com/myhloli/Magic-PDF
parents
2fb4b2ef
d2cb75e8
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
147 additions
and
44 deletions
+147
-44
ocr_demo.py
demo/ocr_demo.py
+18
-8
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+13
-0
para_split.py
magic_pdf/para/para_split.py
+82
-31
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+6
-4
pipeline.py
magic_pdf/pipeline.py
+28
-1
No files found.
demo/ocr_demo.py
View file @
2acd1ecc
...
@@ -4,6 +4,7 @@ import os
...
@@ -4,6 +4,7 @@ import os
from
loguru
import
logger
from
loguru
import
logger
from
pathlib
import
Path
from
pathlib
import
Path
from
demo.demo_test
import
get_json_from_local_or_s3
from
magic_pdf.dict2md.ocr_mkcontent
import
ocr_mk_mm_markdown_with_para
,
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
from
magic_pdf.dict2md.ocr_mkcontent
import
ocr_mk_mm_markdown_with_para
,
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
...
@@ -29,14 +30,7 @@ def read_json_file(file_path):
...
@@ -29,14 +30,7 @@ def read_json_file(file_path):
return
data
return
data
if
__name__
==
'__main__'
:
def
ocr_local_parse
(
ocr_pdf_path
,
ocr_json_file_path
):
#ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
ocr_pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
ocr_json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
try
:
try
:
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
...
@@ -72,3 +66,19 @@ if __name__ == '__main__':
...
@@ -72,3 +66,19 @@ if __name__ == '__main__':
# save_markdown(markdown_text, ocr_json_file_path)
# save_markdown(markdown_text, ocr_json_file_path)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
exception
(
e
)
logger
.
exception
(
e
)
def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
    """
    Look up the JSON record for ``book_name`` and write it to the log.

    The record is obtained through ``get_json_from_local_or_s3``.
    ``start_page_id`` and ``debug_mode`` are accepted but are not used here.
    """
    logger.info(get_json_from_local_or_s3(book_name))
if __name__ == '__main__':
    # Developer-local sample inputs; point these at your own PDF and its OCR model JSON.
    ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
    ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"

    # Run the online lookup first, then parse the local sample.
    ocr_online_parse(book_name="数学新星网/edu_00001236")
    ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
magic_pdf/dict2md/ocr_mkcontent.py
View file @
2acd1ecc
...
@@ -79,6 +79,19 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
...
@@ -79,6 +79,19 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
return
'
\n\n
'
.
join
(
markdown
)
return
'
\n\n
'
.
join
(
markdown
)
def make_standard_format_with_para(pdf_info_dict: dict):
    """
    Flatten every page's paragraph blocks into one list of standard-format items.

    Pages are visited in dict order. For each page, the "para_blocks" entry is read;
    pages where it is missing or empty contribute nothing. Every line of every
    paragraph is converted with ``line_to_standard_format`` and the results are
    returned in reading order.
    """
    return [
        line_to_standard_format(line)
        for page_info in pdf_info_dict.values()
        for para in (page_info.get("para_blocks") or ())
        for line in para
    ]
def
line_to_standard_format
(
line
):
def
line_to_standard_format
(
line
):
line_text
=
""
line_text
=
""
inline_equation_num
=
0
inline_equation_num
=
0
...
...
magic_pdf/para/para_split.py
View file @
2acd1ecc
...
@@ -142,48 +142,51 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
...
@@ -142,48 +142,51 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
return
lines_group
return
lines_group
def
__split_para_in_layoutbox
(
lines_group
,
layout_bboxes
,
lang
=
"en"
,
char_avg_len
=
10
):
def
__split_para_in_layoutbox
(
lines_group
,
new_layout_bbox
,
lang
=
"en"
,
char_avg_len
=
10
):
"""
"""
lines_group 进行行分段——layout内部进行分段。
lines_group 进行行分段——layout内部进行分段。
lines_group内每个元素是一个Layoutbox内的所有行。
1. 先计算每个group的左右边界。
1. 先计算每个group的左右边界。
2. 然后根据行末尾特征进行分段。
2. 然后根据行末尾特征进行分段。
末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。
末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。
且下一行开头不留空白。
"""
"""
paras
=
[]
paras
=
[]
right_tail_distance
=
1.5
*
char_avg_len
right_tail_distance
=
1.5
*
char_avg_len
for
lines
in
lines_group
:
for
lines
in
lines_group
:
if
len
(
lines
)
==
0
:
total_lines
=
len
(
lines
)
if
total_lines
<=
1
:
# 0行无需处理。1行无法分段。
continue
continue
layout_right
=
max
([
line
[
'bbox'
][
2
]
for
line
in
lines
])
#layout_right = max([line['bbox'][2] for line in lines])
layout_right
=
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
new_layout_bbox
)[
2
]
para
=
[]
# 元素是line
para
=
[]
# 元素是line
for
line
in
lines
:
line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
line
[
'spans'
]])
for
i
,
line
in
enumerate
(
lines
):
#logger.info(line_text)
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断一下行结尾特征。
last_span_type
=
line
[
'spans'
][
-
1
][
'type'
]
if
last_span_type
in
[
TEXT
,
INLINE_EQUATION
]:
cur_line_type
=
line
[
'spans'
][
-
1
][
'type'
]
last_char
=
line
[
'spans'
][
-
1
][
'content'
][
-
1
]
#cur_line_last_char = line['spans'][-1]['content'][-1]
if
last_char
in
LINE_STOP_FLAG
or
line
[
'bbox'
][
2
]
<
layout_right
-
right_tail_distance
:
next_line
=
lines
[
i
+
1
]
if
i
<
total_lines
-
1
else
None
if
cur_line_type
in
[
TEXT
,
INLINE_EQUATION
]:
if
line
[
'bbox'
][
2
]
<
layout_right
-
right_tail_distance
:
para
.
append
(
line
)
para
.
append
(
line
)
paras
.
append
(
para
)
paras
.
append
(
para
)
# para_text = ''.join([span['content'] for line in para for span in line['spans']])
# logger.info(para_text)
para
=
[]
para
=
[]
elif
line
[
'bbox'
][
2
]
>=
layout_right
-
right_tail_distance
and
next_line
and
next_line
[
'bbox'
][
0
]
==
layout_right
:
# 现在这行到了行尾沾满,下一行存在且顶格。
para
.
append
(
line
)
else
:
else
:
para
.
append
(
line
)
para
.
append
(
line
)
else
:
# 其他,图片、表格、行间公式,各自占一段
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
paras
.
append
(
para
)
para
=
[]
para
=
[]
else
:
else
:
# 其他,图片、表格、行间公式,各自占一段
paras
.
append
([
line
])
if
len
(
para
)
>
0
:
# 先把之前的段落加入到结果中
paras
.
append
(
para
)
para
=
[]
para
=
[]
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
paras
.
append
([
line
])
# 再把当前行加入到结果中。当前行为行间公式、图、表等。
# logger.info(para_text)
para
=
[]
if
len
(
para
)
>
0
:
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
paras
.
append
(
para
)
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text)
para
=
[]
para
=
[]
return
paras
return
paras
...
@@ -239,6 +242,40 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang="en"):
...
@@ -239,6 +242,40 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang="en"):
return
connected_layout_paras
return
connected_layout_paras
def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, lang):
    """
    Try to join the last paragraph of one page with the first paragraph of the next page.

    The paragraphs are merged only when all of the following hold:
    1. Both the last line of the previous page and the first line of the next page are
       text-like (their last / first span type is TEXT or INLINE_EQUATION).
    2. The last line of the previous page ends exactly at the right edge of its layout box
       and its last character is not in LINE_STOP_FLAG.
    3. The first line of the next page starts exactly at the left edge of its layout box
       (no leading indent).

    On success the first paragraph of ``next_page_paras`` is appended to the last paragraph
    of ``pre_page_paras`` and removed from ``next_page_paras``; both lists are modified in place.

    Args:
        pre_page_paras: paragraphs of the previous page; each paragraph is a list of lines.
        next_page_paras: paragraphs of the following page, same structure.
        pre_page_layout_bbox: layout boxes of the previous page, passed to
            ``__find_layout_bbox_by_line``.
        next_page_layout_bbox: layout boxes of the following page.
        lang: accepted for signature parity with the other helpers; not used here.

    Returns:
        True when the two paragraphs were merged, False otherwise (including when either
        page has no paragraph, line, span or visible text to compare).
    """
    # A page without paragraphs (e.g. a blank page) has nothing to connect;
    # indexing [-1] / [0] on it would raise IndexError.
    if not pre_page_paras or not next_page_paras:
        return False

    pre_last_para = pre_page_paras[-1]
    next_first_para = next_page_paras[0]
    if not pre_last_para or not next_first_para:
        return False

    pre_last_line = pre_last_para[-1]
    next_first_line = next_first_para[0]
    if not pre_last_line['spans'] or not next_first_line['spans']:
        return False

    pre_last_line_type = pre_last_line['spans'][-1]['type']
    next_first_line_type = next_first_line['spans'][0]['type']
    text_like_types = [TEXT, INLINE_EQUATION]
    if pre_last_line_type not in text_like_types or next_first_line_type not in text_like_types:
        # TODO: to do this properly, paragraphs that continue across a table, an image or an
        # interline equation must be considered too. For now non-text lines are never joined.
        return False

    pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]).strip()
    if not pre_last_line_text:
        # Nothing visible on the line, so its ending cannot be judged; do not join.
        return False

    pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)[2]
    next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)[0]

    # NOTE(review): these are exact coordinate comparisons; confirm that the layout boxes are
    # derived from the line boxes, otherwise a tolerance may be needed.
    fills_whole_line = pre_last_line['bbox'][2] == pre_x2_max
    ends_without_stop = pre_last_line_text[-1] not in LINE_STOP_FLAG
    starts_flush_left = next_first_line['bbox'][0] == next_x0_min

    if fills_whole_line and ends_without_stop and starts_flush_left:
        # The previous line is full and unterminated, and the next line has no indent:
        # merge the next page's first paragraph into the previous page's last paragraph.
        pre_page_paras[-1].extend(next_first_para)
        # Drop it from the next page, since it now lives in the previous page's paragraph.
        next_page_paras.pop(0)
        return True

    return False
def
__do_split
(
blocks
,
layout_bboxes
,
new_layout_bbox
,
lang
=
"en"
):
def
__do_split
(
blocks
,
layout_bboxes
,
new_layout_bbox
,
lang
=
"en"
):
"""
"""
根据line和layout情况进行分段
根据line和layout情况进行分段
...
@@ -252,20 +289,34 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, lang="en"):
...
@@ -252,20 +289,34 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, lang="en"):
4. 图、表,目前独占一行,不考虑分段。
4. 图、表,目前独占一行,不考虑分段。
"""
"""
lines_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
lines_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
layout_paras
=
__split_para_in_layoutbox
(
lines_group
,
layout_bboxes
,
lang
)
# layout内分段
layout_paras
=
__split_para_in_layoutbox
(
lines_group
,
new_layout_bbox
,
lang
)
# layout内分段
connected_layout_paras
=
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
lang
)
# layout间链接段落
connected_layout_paras
=
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
lang
)
# layout间链接段落
# TODO 不同页面连接
return
connected_layout_paras
return
connected_layout_paras
def
para_split
(
blocks
,
layout_bboxes
,
lang
=
"en"
):
def
para_split
(
pdf_info_dict
,
lang
=
"en"
):
"""
"""
根据line和layout情况进行分段
根据line和layout情况进行分段
"""
"""
new_layout_bbox
=
__common_pre_proc
(
blocks
,
layout_bboxes
)
new_layout_of_pages
=
[]
# 数组的数组,每个元素是一个页面的layoutS
splited_blocks
=
__do_split
(
blocks
,
layout_bboxes
,
new_layout_bbox
,
lang
)
for
_
,
page
in
pdf_info_dict
.
items
():
blocks
=
page
[
'preproc_blocks'
]
return
splited_blocks
layout_bboxes
=
page
[
'layout_bboxes'
]
new_layout_bbox
=
__common_pre_proc
(
blocks
,
layout_bboxes
)
new_layout_of_pages
.
append
(
new_layout_bbox
)
splited_blocks
=
__do_split
(
blocks
,
layout_bboxes
,
new_layout_bbox
,
lang
)
page
[
'para_blocks'
]
=
splited_blocks
"""连接页面与页面之间的可能合并的段落"""
pdf_infos
=
list
(
pdf_info_dict
.
values
())
for
i
,
page
in
enumerate
(
pdf_info_dict
.
values
()):
if
i
==
0
:
continue
pre_page_paras
=
pdf_infos
[
i
-
1
][
'para_blocks'
]
next_page_paras
=
pdf_infos
[
i
][
'para_blocks'
]
pre_page_layout_bbox
=
new_layout_of_pages
[
i
-
1
]
next_page_layout_bbox
=
new_layout_of_pages
[
i
]
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
lang
)
if
is_conn
:
logger
.
info
(
f
"连接了第{i-1}页和第{i}页的段落"
)
magic_pdf/pdf_parse_by_ocr.py
View file @
2acd1ecc
...
@@ -33,13 +33,12 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
...
@@ -33,13 +33,12 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
def
construct_page_component
(
blocks
,
para_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
def
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
need_remove_spans_bboxes_dict
):
need_remove_spans_bboxes_dict
):
return_dict
=
{
return_dict
=
{
'preproc_blocks'
:
blocks
,
'preproc_blocks'
:
blocks
,
"para_blocks"
:
para_blocks
,
# 分好段落的blocks
'layout_bboxes'
:
layout_bboxes
,
'layout_bboxes'
:
layout_bboxes
,
'page_idx'
:
page_id
,
'page_idx'
:
page_id
,
'page_size'
:
[
page_w
,
page_h
],
'page_size'
:
[
page_w
,
page_h
],
...
@@ -238,7 +237,7 @@ def parse_pdf_by_ocr(
...
@@ -238,7 +237,7 @@ def parse_pdf_by_ocr(
blocks
=
merge_lines_to_block
(
lines
)
blocks
=
merge_lines_to_block
(
lines
)
# 根据block合并段落
# 根据block合并段落
para_blocks
=
para_split
(
blocks
,
layout_bboxes
)
#
para_blocks = para_split(blocks, layout_bboxes)
# 获取QA需要外置的list
# 获取QA需要外置的list
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
...
@@ -267,12 +266,15 @@ def parse_pdf_by_ocr(
...
@@ -267,12 +266,15 @@ def parse_pdf_by_ocr(
# 构造pdf_info_dict
# 构造pdf_info_dict
page_info
=
construct_page_component
(
blocks
,
para_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
need_remove_spans_bboxes_dict
)
need_remove_spans_bboxes_dict
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
"""分段"""
para_split
(
pdf_info_dict
)
# 在测试时,保存调试信息
# 在测试时,保存调试信息
if
debug_mode
:
if
debug_mode
:
params_file_save_path
=
join_path
(
params_file_save_path
=
join_path
(
...
...
magic_pdf/pipeline.py
View file @
2acd1ecc
...
@@ -3,7 +3,8 @@ import sys
...
@@ -3,7 +3,8 @@ import sys
import
time
import
time
from
urllib.parse
import
quote
from
urllib.parse
import
quote
from
magic_pdf.dict2md.ocr_mkcontent
import
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
from
magic_pdf.dict2md.ocr_mkcontent
import
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
,
\
ocr_mk_mm_markdown_with_para
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
parse_bucket_key
,
formatted_time
,
s3_image_save_path
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
parse_bucket_key
,
formatted_time
,
s3_image_save_path
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
...
@@ -407,6 +408,32 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
...
@@ -407,6 +408,32 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
return
jso
return
jso
def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(jso: dict, debug_mode=False) -> dict:
    """
    Render the intermediate parse result stored in ``jso`` as paragraph-aware markdown.

    Args:
        jso: record dict. Reads 'pdf_intermediate_dict' (compressed) and 'file_id', and
            'need_drop' when not in debug mode.
        debug_mode: when True, the 'need_drop' check is skipped.

    Returns:
        The record. If it was flagged 'need_drop' (and debug mode is off) it is returned with
        "dropped" set to True. Otherwise "content_ocr" holds the markdown and the
        "doc_layout_result", "pdf_intermediate_dict" and "pdf_meta" fields are blanked.
        If conversion raises, the result of ``exception_handler(jso, e)`` is returned instead.
    """
    # Outside debug mode, records already flagged for dropping are marked and returned as-is.
    if not debug_mode and jso.get('need_drop', False):
        book_name = join_path(get_data_source(jso), jso['file_id'])
        # `file=` is a print() keyword, not a logging one, so it is not passed to the logger.
        # NOTE(review): assuming `logger` is loguru's, extra kwargs are treated as format
        # arguments and would make a message containing literal braces fail — confirm.
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # The intermediate dict is stored compressed; decompress before rendering.
        pdf_intermediate_dict = JsonCompressor.decompress_json(jso['pdf_intermediate_dict'])
        markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
        jso["content_ocr"] = markdown_content
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
        )
        # Blank out the fields that are no longer needed once the markdown exists.
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        # Conversion is best-effort: failures are recorded on the record by the shared handler.
        jso = exception_handler(jso, e)
    return jso
def
ocr_pdf_intermediate_dict_to_standard_format
(
jso
:
dict
,
debug_mode
=
False
)
->
dict
:
def
ocr_pdf_intermediate_dict_to_standard_format
(
jso
:
dict
,
debug_mode
=
False
)
->
dict
:
if
debug_mode
:
if
debug_mode
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment