Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
388223f2
Commit
388223f2
authored
Mar 08, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocr模式下删除header/page number/footnote/footer
parent
fcea39d3
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
179 additions
and
39 deletions
+179
-39
draw_bbox.py
demo/draw_bbox.py
+5
-4
ocr_demo.py
demo/ocr_demo.py
+27
-6
commons.py
magic_pdf/libs/commons.py
+29
-0
pdf_parse_by_model.py
magic_pdf/pdf_parse_by_model.py
+1
-26
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+99
-3
ocr_remove_spans.py
magic_pdf/pre_proc/ocr_remove_spans.py
+18
-0
No files found.
demo/draw_bbox.py
View file @
388223f2
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
# PDF文件路径
# PDF文件路径
pdf_path
=
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018
.pdf"
pdf_path
=
r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\ocr_1.json
.pdf"
doc
=
fitz
.
open
(
pdf_path
)
# Open the PDF
doc
=
fitz
.
open
(
pdf_path
)
# Open the PDF
# 你的数据
# 你的数据
data
=
[[
[
-
2
,
0
,
603
,
80
,
24
]],
[[
-
3
,
0
,
602
,
80
,
24
]
]]
data
=
[[
(
294.7569528415961
,
776.8430953398889
,
300.8827085852479
,
786.922616502779
),
(
460.1523579201934
,
776.8430953398889
,
509.51874244256345
,
787.2825994014537
)],
[(
294.03627569528413
,
779.7229585292861
,
301.24304715840384
,
788.3625480974777
),
(
85.76058041112454
,
781.882855921334
,
156.74727932285367
,
789.8024796921762
)],
[(
293.6759371221282
,
779.7229585292861
,
301.60338573155985
,
788.7225309961523
),
(
459.43168077388145
,
779.7229585292861
,
508.7980652962515
,
789.8024796921762
)],
[(
295.8379685610641
,
780.0829414279607
,
301.24304715840384
,
788.0025651988029
),
(
85.76058041112454
,
781.5228730226593
,
156.74727932285367
,
790.1624625908509
)],
[(
294.03627569528413
,
779.7229585292861
,
301.60338573155985
,
789.0825138948269
),
(
459.79201934703747
,
779.7229585292861
,
508.4377267230955
,
789.4424967935015
)],
[(
86.4812575574365
,
781.882855921334
,
156.0266021765417
,
789.8024796921762
)],
[(
294.39661426844015
,
779.7229585292861
,
301.24304715840384
,
788.3625480974777
),
(
459.43168077388145
,
779.7229585292861
,
508.7980652962515
,
789.4424967935015
)],
[(
294.03627569528413
,
779.7229585292861
,
301.24304715840384
,
788.3625480974777
),
(
85.76058041112454
,
781.5228730226593
,
156.74727932285367
,
789.8024796921762
)],
[(
294.39661426844015
,
779.7229585292861
,
300.8827085852479
,
788.3625480974777
)
]]
# 对每个页面进行处理
# 对每个页面进行处理
for
i
,
page
in
enumerate
(
doc
):
for
i
,
page
in
enumerate
(
doc
):
# 获取当前页面的数据
# 获取当前页面的数据
page_data
=
data
[
i
]
page_data
=
data
[
i
]
for
img
in
page_data
:
for
img
in
page_data
:
x0
,
y0
,
x1
,
y1
,
_
=
img
# x0, y0, x1, y1, _ = img
x0
,
y0
,
x1
,
y1
=
img
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
(
1
,
0
,
0
),
fill
=
None
,
width
=
1.5
,
overlay
=
True
)
# Draw the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
(
1
,
0
,
0
),
fill
=
None
,
width
=
1.5
,
overlay
=
True
)
# Draw the rectangle
# Save the PDF
# Save the PDF
doc
.
save
(
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018_new.pdf"
)
doc
.
save
(
r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\ocr_1.json_new.pdf"
)
\ No newline at end of file
\ No newline at end of file
demo/ocr_demo.py
View file @
388223f2
...
@@ -2,8 +2,10 @@ import json
...
@@ -2,8 +2,10 @@ import json
import
os
import
os
from
loguru
import
logger
from
loguru
import
logger
from
pathlib
import
Path
from
magic_pdf.dict2md.ocr_mkcontent
import
mk_nlp_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
mk_nlp_markdown
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
...
@@ -28,12 +30,31 @@ def read_json_file(file_path):
...
@@ -28,12 +30,31 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_1(3).json"
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
try
:
try
:
ocr_pdf_info
=
read_json_file
(
ocr_json_file_path
)
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_info
)
pth
=
Path
(
ocr_json_file_path
)
markdown_text
=
mk_nlp_markdown
(
pdf_info_dict
)
book_name
=
pth
.
name
logger
.
info
(
markdown_text
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_markdown
(
markdown_text
,
ocr_json_file_path
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
text_content_save_path
=
f
"{save_path}/{book_name}/book.md"
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_path
,
None
,
ocr_pdf_model_info
,
book_name
,
debug_mode
=
True
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
if
not
os
.
path
.
exists
(
parent_dir
):
os
.
makedirs
(
parent_dir
)
markdown_content
=
mk_nlp_markdown
(
pdf_info_dict
)
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
markdown_content
)
# logger.info(markdown_content)
# save_markdown(markdown_text, ocr_json_file_path)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
error
(
e
)
logger
.
error
(
e
)
magic_pdf/libs/commons.py
View file @
388223f2
import
datetime
import
datetime
import
json
import
os
,
re
,
configparser
import
os
,
re
,
configparser
import
time
import
time
...
@@ -115,6 +116,34 @@ def read_file(pdf_path: str, s3_profile):
...
@@ -115,6 +116,34 @@ def read_file(pdf_path: str, s3_profile):
with
open
(
pdf_path
,
"rb"
)
as
f
:
with
open
(
pdf_path
,
"rb"
)
as
f
:
return
f
.
read
()
return
f
.
read
()
def get_docx_model_output(pdf_model_output, pdf_model_s3_profile, page_id):
    """Return the model output belonging to one page.

    Args:
        pdf_model_output: Either a ``str`` location under which the model JSON
            files live, or a ``list`` that is indexed directly by ``page_id``.
        pdf_model_s3_profile: Passed through to ``read_file`` whenever a JSON
            file is read through it.
        pdf_model_output: See above.
        page_id: Page index. Per-page files named ``page_N.json`` are looked
            up with ``N = page_id + 1``.

    Returns:
        The decoded JSON (or list element) for the requested page.

    Raises:
        TypeError: If ``pdf_model_output`` is neither a ``str`` nor a ``list``.
    """
    if isinstance(pdf_model_output, list):
        return pdf_model_output[page_id]
    if not isinstance(pdf_model_output, str):
        # Previously this fell through to an UnboundLocalError on the return.
        raise TypeError(
            "pdf_model_output must be a str or a list, got "
            f"{type(pdf_model_output).__name__}"
        )

    # Page numbers in the per-page model output files start at 1.
    model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")
    if os.path.exists(model_output_json_path):
        json_from_docx = read_file(model_output_json_path, pdf_model_s3_profile)
        return json.loads(json_from_docx)

    try:
        # Second choice: a single local model.json covering every page.
        model_output_json_path = join_path(pdf_model_output, "model.json")
        with open(model_output_json_path, "r", encoding="utf-8") as f:
            model_output_json = json.load(f)
        return model_output_json["doc_layout_result"][page_id]
    except Exception:
        # Last resort: read "<page_id>.json" through read_file. Only Exception
        # is caught so KeyboardInterrupt / SystemExit still propagate.
        s3_model_output_json_path = join_path(pdf_model_output, f"{page_id}.json")
        s = read_file(s3_model_output_json_path, pdf_model_s3_profile)
        return json.loads(s)
def
list_dir
(
dir_path
:
str
,
s3_profile
:
str
):
def
list_dir
(
dir_path
:
str
,
s3_profile
:
str
):
"""
"""
列出dir_path下的所有文件
列出dir_path下的所有文件
...
...
magic_pdf/pdf_parse_by_model.py
View file @
388223f2
...
@@ -2,7 +2,7 @@ import time
...
@@ -2,7 +2,7 @@ import time
# from anyio import Path
# from anyio import Path
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
,
get_img_s3_client
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
,
get_img_s3_client
,
get_docx_model_output
import
json
import
json
import
os
import
os
import
math
import
math
...
@@ -68,31 +68,6 @@ paraSplitException_msg = ParaSplitException().message
...
@@ -68,31 +68,6 @@ paraSplitException_msg = ParaSplitException().message
paraMergeException_msg
=
ParaMergeException
()
.
message
paraMergeException_msg
=
ParaMergeException
()
.
message
def get_docx_model_output(pdf_model_output, pdf_model_s3_profile, page_id):
    """Return the model output belonging to one page.

    Args:
        pdf_model_output: Either a ``str`` location under which the model JSON
            files live, or a ``list`` that is indexed directly by ``page_id``.
        pdf_model_s3_profile: Passed through to ``read_file`` whenever a JSON
            file is read through it.
        page_id: Page index. Per-page files named ``page_N.json`` are looked
            up with ``N = page_id + 1``.

    Returns:
        The decoded JSON (or list element) for the requested page.

    Raises:
        TypeError: If ``pdf_model_output`` is neither a ``str`` nor a ``list``.
    """
    if isinstance(pdf_model_output, list):
        return pdf_model_output[page_id]
    if not isinstance(pdf_model_output, str):
        # Previously this fell through to an UnboundLocalError on the return.
        raise TypeError(
            "pdf_model_output must be a str or a list, got "
            f"{type(pdf_model_output).__name__}"
        )

    # Page numbers in the per-page model output files start at 1.
    model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")
    if os.path.exists(model_output_json_path):
        json_from_docx = read_file(model_output_json_path, pdf_model_s3_profile)
        return json.loads(json_from_docx)

    try:
        # Second choice: a single local model.json covering every page.
        model_output_json_path = join_path(pdf_model_output, "model.json")
        with open(model_output_json_path, "r", encoding="utf-8") as f:
            model_output_json = json.load(f)
        return model_output_json["doc_layout_result"][page_id]
    except Exception:
        # Last resort: read "<page_id>.json" through read_file. Only Exception
        # is caught so KeyboardInterrupt / SystemExit still propagate.
        s3_model_output_json_path = join_path(pdf_model_output, f"{page_id}.json")
        s = read_file(s3_model_output_json_path, pdf_model_s3_profile)
        return json.loads(s)
def
parse_pdf_by_model
(
def
parse_pdf_by_model
(
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
388223f2
import
os
import
time
from
loguru
import
logger
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_docx_model_output
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
from
magic_pdf.pre_proc.detect_footnote
import
parse_footnotes_by_model
from
magic_pdf.pre_proc.detect_header
import
parse_headers
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
,
remove_overlaps_min_spans
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
,
remove_overlaps_min_spans
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
def
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
):
def
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
):
...
@@ -12,22 +24,100 @@ def construct_page_component(page_id, blocks, layout_bboxes):
...
@@ -12,22 +24,100 @@ def construct_page_component(page_id, blocks, layout_bboxes):
def
parse_pdf_by_ocr
(
def
parse_pdf_by_ocr
(
ocr_pdf_info
,
pdf_path
,
s3_pdf_profile
,
pdf_model_output
,
book_name
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
start_page_id
=
0
,
end_page_id
=
None
,
end_page_id
=
None
,
debug_mode
=
False
,
):
):
pdf_bytes
=
read_file
(
pdf_path
,
s3_pdf_profile
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
book_name
=
sanitize_filename
(
book_name
)
md_bookname_save_path
=
""
if
debug_mode
:
save_path
=
join_path
(
save_tmp_path
,
"md"
)
pdf_local_path
=
join_path
(
save_tmp_path
,
"download-pdfs"
,
book_name
)
if
not
os
.
path
.
exists
(
os
.
path
.
dirname
(
pdf_local_path
)):
# 如果目录不存在,创建它
os
.
makedirs
(
os
.
path
.
dirname
(
pdf_local_path
))
md_bookname_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
)
if
not
os
.
path
.
exists
(
md_bookname_save_path
):
# 如果目录不存在,创建它
os
.
makedirs
(
md_bookname_save_path
)
with
open
(
pdf_local_path
+
".pdf"
,
"wb"
)
as
pdf_file
:
pdf_file
.
write
(
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
# 初始化空的pdf_info_dict
pdf_info_dict
=
{}
pdf_info_dict
=
{}
end_page_id
=
end_page_id
if
end_page_id
else
len
(
ocr_pdf_info
)
-
1
img_s3_client
=
get_img_s3_client
(
save_path
,
image_s3_config
)
start_time
=
time
.
time
()
remove_bboxes
=
[]
end_page_id
=
end_page_id
if
end_page_id
else
len
(
pdf_docs
)
-
1
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
ocr_page_info
=
ocr_pdf_info
[
page_id
]
# 获取当前页的page对象
page
=
pdf_docs
[
page_id
]
if
debug_mode
:
time_now
=
time
.
time
()
logger
.
info
(
f
"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
)
start_time
=
time_now
# 获取当前页的模型数据
ocr_page_info
=
get_docx_model_output
(
pdf_model_output
,
pdf_model_profile
,
page_id
)
"""从json中获取每页的页码、页眉、页脚的bbox"""
page_no_bboxes
=
parse_pageNos
(
page_id
,
page
,
ocr_page_info
)
header_bboxes
=
parse_headers
(
page_id
,
page
,
ocr_page_info
)
footer_bboxes
=
parse_footers
(
page_id
,
page
,
ocr_page_info
)
footnote_bboxes
=
parse_footnotes_by_model
(
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
debug_mode
=
debug_mode
)
# 构建需要remove的bbox列表
need_remove_spans_bboxes
=
[]
need_remove_spans_bboxes
.
extend
(
page_no_bboxes
)
need_remove_spans_bboxes
.
extend
(
header_bboxes
)
need_remove_spans_bboxes
.
extend
(
footer_bboxes
)
need_remove_spans_bboxes
.
extend
(
footnote_bboxes
)
remove_bboxes
.
append
(
need_remove_spans_bboxes
)
layout_dets
=
ocr_page_info
[
'layout_dets'
]
layout_dets
=
ocr_page_info
[
'layout_dets'
]
spans
=
[]
spans
=
[]
# 将模型坐标转换成pymu格式下的未缩放坐标
DPI
=
72
# use this resolution
pix
=
page
.
get_pixmap
(
dpi
=
DPI
)
pageL
=
0
pageR
=
int
(
pix
.
w
)
pageU
=
0
pageD
=
int
(
pix
.
h
)
width_from_json
=
ocr_page_info
[
'page_info'
][
'width'
]
height_from_json
=
ocr_page_info
[
'page_info'
][
'height'
]
LR_scaleRatio
=
width_from_json
/
(
pageR
-
pageL
)
UD_scaleRatio
=
height_from_json
/
(
pageD
-
pageU
)
for
layout_det
in
layout_dets
:
for
layout_det
in
layout_dets
:
category_id
=
layout_det
[
'category_id'
]
category_id
=
layout_det
[
'category_id'
]
allow_category_id_list
=
[
1
,
7
,
13
,
14
,
15
]
allow_category_id_list
=
[
1
,
7
,
13
,
14
,
15
]
if
category_id
in
allow_category_id_list
:
if
category_id
in
allow_category_id_list
:
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
layout_det
[
'poly'
]
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
layout_det
[
'poly'
]
x0
=
x0
/
LR_scaleRatio
y0
=
y0
/
UD_scaleRatio
x1
=
x1
/
LR_scaleRatio
y1
=
y1
/
UD_scaleRatio
bbox
=
[
int
(
x0
),
int
(
y0
),
int
(
x1
),
int
(
y1
)]
bbox
=
[
int
(
x0
),
int
(
y0
),
int
(
x1
),
int
(
y1
)]
'''要删除的'''
'''要删除的'''
# 3: 'header', # 页眉
# 3: 'header', # 页眉
...
@@ -48,8 +138,10 @@ def parse_pdf_by_ocr(
...
@@ -48,8 +138,10 @@ def parse_pdf_by_ocr(
}
}
if
category_id
==
1
:
if
category_id
==
1
:
span
[
'type'
]
=
'image'
span
[
'type'
]
=
'image'
elif
category_id
==
7
:
elif
category_id
==
7
:
span
[
'type'
]
=
'table'
span
[
'type'
]
=
'table'
elif
category_id
==
13
:
elif
category_id
==
13
:
span
[
'content'
]
=
layout_det
[
'latex'
]
span
[
'content'
]
=
layout_det
[
'latex'
]
span
[
'type'
]
=
'inline_equation'
span
[
'type'
]
=
'inline_equation'
...
@@ -67,6 +159,9 @@ def parse_pdf_by_ocr(
...
@@ -67,6 +159,9 @@ def parse_pdf_by_ocr(
# 删除重叠spans中较小的那些
# 删除重叠spans中较小的那些
spans
=
remove_overlaps_min_spans
(
spans
)
spans
=
remove_overlaps_min_spans
(
spans
)
# 删除remove_span_block_bboxes中的bbox
spans
=
remove_spans_by_bboxes
(
spans
,
need_remove_spans_bboxes
)
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
...
@@ -89,5 +184,6 @@ def parse_pdf_by_ocr(
...
@@ -89,5 +184,6 @@ def parse_pdf_by_ocr(
page_info
=
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
)
page_info
=
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
# logger.info(remove_bboxes)
return
pdf_info_dict
return
pdf_info_dict
magic_pdf/pre_proc/ocr_remove_spans.py
0 → 100644
View file @
388223f2
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove every span that lies in, or partly overlaps, a removal bbox.

    Args:
        spans: List of span dicts; each one is read through its ``'bbox'``
            key. The list is filtered in place.
        need_remove_spans_bboxes: Bboxes handed to ``_is_in_or_part_overlap``
            as its second argument, one at a time.

    Returns:
        The same ``spans`` list object, now holding only the spans for which
        ``_is_in_or_part_overlap`` was false against every removal bbox.
    """
    # Slice assignment keeps the caller's list object (the original mutated
    # and returned it) while filtering in one pass instead of calling
    # list.remove() once per match.
    spans[:] = [
        span
        for span in spans
        if not any(
            _is_in_or_part_overlap(span['bbox'], bbox)
            for bbox in need_remove_spans_bboxes
        )
    ]
    return spans
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment