Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f31117de
Commit
f31117de
authored
Mar 12, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
debug时自动绘制layout区域和text区域
parent
ec1a6ef7
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
83 additions
and
33 deletions
+83
-33
draw_bbox.py
demo/draw_bbox.py
+62
-17
ocr_demo.py
demo/ocr_demo.py
+12
-5
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+9
-11
No files found.
demo/draw_bbox.py
View file @
f31117de
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
# PDF文件路径
def draw_bbox(i, bbox_list, page, rgb_config):
    """Outline every bounding box recorded for page ``i`` on ``page``.

    Args:
        i: Zero-based index into ``bbox_list`` selecting this page's boxes.
        bbox_list: Per-page sequence; each entry is an iterable of
            ``(x0, y0, x1, y1)`` boxes.
        page: Object exposing ``draw_rect`` (a PyMuPDF page in the callers
            visible in this file).
        rgb_config: Three colour channels given on a 0-255 scale.

    Returns:
        None. The rectangles are drawn onto ``page`` in place.

    A page index beyond the end of ``bbox_list`` is skipped silently: this is
    a debug overlay, and a page with no recorded boxes should not abort the
    caller.
    """
    if i >= len(bbox_list):
        return

    # Channels arrive as 0-255 and are rescaled to 0-1 before being handed
    # to draw_rect as the stroke colour.
    stroke = [float(channel) / 255 for channel in rgb_config]

    for x0, y0, x1, y1 in bbox_list[i]:
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            color=stroke,
            fill=None,
            width=0.5,
            overlay=True,
        )
def draw_layout_bbox(pdf_info_dict, input_path, out_path):
    """Write ``layout.pdf``: the input PDF with its layout boxes outlined.

    Each layout box is drawn in red and labelled with its 1-based position
    within the page's ``'layout_bboxes'`` list.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Each value
            must contain ``'layout_bboxes'``, a list of dicts that carry a
            ``'layout_bbox'`` entry unpackable as ``(x0, y0, x1, y1)``.
        input_path: Path of the PDF to open.
        out_path: Directory that receives ``layout.pdf``.

    Raises:
        KeyError: If a page or layout entry lacks the keys named above.
    """
    layout_bbox_list = [
        [layout['layout_bbox'] for layout in page_info['layout_bboxes']]
        for page_info in pdf_info_dict.values()
    ]

    doc = fitz.open(input_path)
    try:
        # NOTE(review): assumes the mapping's values follow the document's
        # page order starting at the first page - confirm against the code
        # that builds pdf_info_dict. zip() stops at the shorter side, so PDF
        # pages without parsed data are left untouched instead of raising
        # IndexError.
        for page, page_boxes in zip(doc, layout_bbox_list):
            for order, (x0, y0, x1, y1) in enumerate(page_boxes, start=1):
                page.draw_rect(
                    fitz.Rect(x0, y0, x1, y1),
                    color=(1, 0, 0),
                    fill=None,
                    width=0.5,
                    overlay=True,
                )
                # Label the box with its index at the top-left corner.
                page.insert_text(
                    (x0, y0), str(order), fontsize=10, color=(1, 0, 0)
                )
        doc.save(f"{out_path}/layout.pdf")
    finally:
        # Release the file handle even when drawing or saving fails.
        doc.close()
def draw_text_bbox(pdf_info_dict, input_path, out_path):
    """Write ``text.pdf``: the input PDF with span boxes outlined by type.

    Spans of type ``'text'`` are outlined in red, ``'inline_equation'`` in
    green and ``'displayed_equation'`` in blue. Spans of any other type are
    not drawn.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Each value
            must contain ``'preproc_blocks'``; every block has ``'lines'``,
            every line has ``'spans'``, and every span has ``'type'`` and
            ``'bbox'``.
        input_path: Path of the PDF to open.
        out_path: Directory that receives ``text.pdf``.

    Raises:
        KeyError: If any of the keys named above is missing.
    """
    # Span type -> stroke colour as 0-255 RGB. Insertion order is also the
    # drawing order on each page.
    span_colors = {
        'text': [255, 0, 0],
        'inline_equation': [0, 255, 0],
        'displayed_equation': [0, 0, 255],
    }

    # For every span type, one list of boxes per parsed page.
    boxes_by_type = {span_type: [] for span_type in span_colors}
    for page_info in pdf_info_dict.values():
        page_boxes = {span_type: [] for span_type in span_colors}
        for block in page_info['preproc_blocks']:
            for line in block['lines']:
                for span in line['spans']:
                    bucket = page_boxes.get(span['type'])
                    if bucket is not None:
                        bucket.append(span['bbox'])
        for span_type, bucket in page_boxes.items():
            boxes_by_type[span_type].append(bucket)

    parsed_pages = len(pdf_info_dict)
    doc = fitz.open(input_path)
    try:
        # NOTE(review): assumes the mapping's values follow the document's
        # page order starting at the first page - confirm against the code
        # that builds pdf_info_dict.
        for i, page in enumerate(doc):
            if i >= parsed_pages:
                # The PDF has more pages than were parsed; nothing to draw.
                break
            for span_type, rgb in span_colors.items():
                draw_bbox(i, boxes_by_type[span_type], page, rgb)
        doc.save(f"{out_path}/text.pdf")
    finally:
        # Release the file handle even when drawing or saving fails.
        doc.close()
\ No newline at end of file
demo/ocr_demo.py
View file @
f31117de
...
@@ -4,7 +4,7 @@ import os
...
@@ -4,7 +4,7 @@ import os
from
loguru
import
logger
from
loguru
import
logger
from
pathlib
import
Path
from
pathlib
import
Path
from
magic_pdf.dict2md.ocr_mkcontent
import
mk_nlp_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
mk_nlp_markdown
,
mk_mm_markdown
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
...
@@ -30,15 +30,20 @@ def read_json_file(file_path):
...
@@ -30,15 +30,20 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0_org.pdf"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0.json"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
try
:
try
:
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
book_name
=
pth
.
name
book_name
=
pth
.
name
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
text_content_save_path
=
f
"{save_path}/{book_name}/book.md"
save_path_with_bookname
=
os
.
path
.
join
(
save_path
,
book_name
)
text_content_save_path
=
f
"{save_path_with_bookname}/book.md"
pdf_info_dict
=
parse_pdf_by_ocr
(
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_path
,
ocr_pdf_path
,
None
,
None
,
...
@@ -46,11 +51,13 @@ if __name__ == '__main__':
...
@@ -46,11 +51,13 @@ if __name__ == '__main__':
save_path
,
save_path
,
book_name
,
book_name
,
debug_mode
=
True
)
debug_mode
=
True
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
if
not
os
.
path
.
exists
(
parent_dir
):
if
not
os
.
path
.
exists
(
parent_dir
):
os
.
makedirs
(
parent_dir
)
os
.
makedirs
(
parent_dir
)
markdown_content
=
mk_nlp_markdown
(
pdf_info_dict
)
# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content
=
mk_mm_markdown
(
pdf_info_dict
)
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
markdown_content
)
f
.
write
(
markdown_content
)
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
f31117de
...
@@ -4,6 +4,7 @@ import time
...
@@ -4,6 +4,7 @@ import time
from
loguru
import
logger
from
loguru
import
logger
from
demo.draw_bbox
import
draw_layout_bbox
,
draw_text_bbox
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_docx_model_output
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_docx_model_output
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.libs.safe_filename
import
sanitize_filename
...
@@ -185,14 +186,11 @@ def parse_pdf_by_ocr(
...
@@ -185,14 +186,11 @@ def parse_pdf_by_ocr(
# 在测试时,保存调试信息
# 在测试时,保存调试信息
if
debug_mode
:
if
debug_mode
:
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
page_draw_rect_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"layout.pdf"
)
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
# 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除
# drow_bbox
if
os
.
path
.
exists
(
page_draw_rect_save_path
):
draw_layout_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
os
.
remove
(
page_draw_rect_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
# 绘制bbox和layout到pdf
return
pdf_info_dict
return
pdf_info_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment