Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
ce96c3f6
Commit
ce96c3f6
authored
Mar 20, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
为ocr模式的demo增加online模式,pipeline进行微调适配online模式
parent
49bf40cc
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
59 additions
and
52 deletions
+59
-52
demo_test.py
demo/demo_test.py
+1
-1
ocr_demo.py
demo/ocr_demo.py
+48
-39
draw_bbox.py
magic_pdf/libs/draw_bbox.py
+8
-10
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+2
-2
No files found.
demo/demo_test.py
View file @
ce96c3f6
...
...
@@ -34,7 +34,7 @@ def get_json_from_local_or_s3(book_name=None):
s3_config
=
get_s3_config
(
json_path
)
file_content
=
read_file
(
json_path
,
s3_config
)
json_str
=
file_content
.
decode
(
"utf-8"
)
logger
.
info
(
json_str
)
#
logger.info(json_str)
json_object
=
json
.
loads
(
json_str
)
return
json_object
...
...
demo/ocr_demo.py
View file @
ce96c3f6
...
...
@@ -4,6 +4,7 @@ import os
from
loguru
import
logger
from
pathlib
import
Path
from
app.common.s3
import
get_s3_config
from
demo.demo_test
import
get_json_from_local_or_s3
from
magic_pdf.dict2md.ocr_mkcontent
import
ocr_mk_mm_markdown_with_para
,
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
from
magic_pdf.libs.commons
import
join_path
...
...
@@ -35,50 +36,58 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path):
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
book_name
=
pth
.
name
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
save_path_with_bookname
=
os
.
path
.
join
(
save_path
,
book_name
)
text_content_save_path
=
f
"{save_path_with_bookname}/book.md"
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_path
,
None
,
ocr_pdf_model_info
,
save_path
,
book_name
,
debug_mode
=
True
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
if
not
os
.
path
.
exists
(
parent_dir
):
os
.
makedirs
(
parent_dir
)
# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content
=
ocr_mk_mm_markdown_with_para
(
pdf_info_dict
)
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
markdown_content
)
standard_format
=
ocr_mk_mm_standard_format
(
pdf_info_dict
)
standard_format_save_path
=
f
"{save_path_with_bookname}/standard_format.txt"
with
open
(
standard_format_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
str
(
standard_format
))
# logger.info(markdown_content)
# save_markdown(markdown_text, ocr_json_file_path)
ocr_parse_core
(
book_name
,
ocr_pdf_path
,
ocr_pdf_model_info
)
except
Exception
as
e
:
logger
.
exception
(
e
)
def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
    """Run the OCR demo for a book whose metadata JSON is looked up by name.

    The record returned by ``get_json_from_local_or_s3`` supplies the PDF
    location (key ``"file_location"``) and the layout-model output (key
    ``"doc_layout_result"``). Both are handed to ``ocr_parse_core`` together
    with the S3 config resolved from the PDF location.

    Args:
        book_name: Identifier passed to ``get_json_from_local_or_s3`` and
            forwarded to ``ocr_parse_core``.
        start_page_id: Accepted for interface compatibility; not used here.
        debug_mode: Accepted for interface compatibility; not used here.

    Returns:
        None. Any exception is logged with ``logger.exception`` and swallowed.
    """
    try:
        # Fetch the metadata exactly once, inside the try block, so a failed
        # lookup goes through the same logged error path as every other step.
        json_object = get_json_from_local_or_s3(book_name)
        s3_pdf_path = json_object["file_location"]
        s3_config = get_s3_config(s3_pdf_path)
        ocr_pdf_model_info = json_object["doc_layout_result"]
        ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config)
    except Exception as e:
        logger.exception(e)
def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, s3_config=None):
    """Parse one PDF with the OCR pipeline and write the demo outputs to disk.

    Outputs go under ``<this file's dir>/../../tmp/unittest/md/<book_name>/``:
    ``book.md`` holds the result of ``ocr_mk_mm_markdown_with_para`` and
    ``standard_format.txt`` holds ``str()`` of the result of
    ``ocr_mk_mm_standard_format``.

    Args:
        book_name: Name of the per-book output sub-directory; also passed to
            ``parse_pdf_by_ocr``.
        ocr_pdf_path: PDF location passed to ``parse_pdf_by_ocr``.
        ocr_pdf_model_info: Model output passed to ``parse_pdf_by_ocr``.
        start_page_id: Accepted for interface compatibility; not used here.
        s3_config: Passed to ``parse_pdf_by_ocr``; callers in this file pass
            ``None`` for local PDFs.

    Returns:
        None.
    """
    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
    save_path = join_path(save_tmp_path, "md")
    save_path_with_bookname = os.path.join(save_path, book_name)
    text_content_save_path = f"{save_path_with_bookname}/book.md"
    standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"

    pdf_info_dict = parse_pdf_by_ocr(
        ocr_pdf_path,
        s3_config,
        ocr_pdf_model_info,
        save_path,
        book_name,
        debug_mode=True,
    )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    parent_dir = os.path.dirname(text_content_save_path)
    os.makedirs(parent_dir, exist_ok=True)

    markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
    with open(text_content_save_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    standard_format = ocr_mk_mm_standard_format(pdf_info_dict)
    with open(standard_format_save_path, "w", encoding="utf-8") as f:
        f.write(str(standard_format))
if __name__ == '__main__':
    # Demo entry point: run the online (metadata looked up by book name) flow
    # first, then the local flow against a sample PDF and its model-output JSON.
    local_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
    local_json_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"

    ocr_online_parse(book_name="数学新星网/edu_00001236")
    ocr_local_parse(local_pdf_path, local_json_path)
magic_pdf/libs/draw_bbox.py
View file @
ce96c3f6
...
...
@@ -27,7 +27,7 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
page
.
insert_text
((
x0
,
y0
),
str
(
j
+
1
),
fontsize
=
10
,
color
=
new_rgb
)
# Insert the index at the top left corner of the rectangle
def
draw_layout_bbox
(
pdf_info_dict
,
input_path
,
out_path
):
def
draw_layout_bbox
(
pdf_info_dict
,
pdf_bytes
,
out_path
):
layout_bbox_list
=
[]
dropped_bbox_list
=
[]
for
page
in
pdf_info_dict
.
values
():
...
...
@@ -40,15 +40,14 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
for
dropped_bbox
in
dropped_bboxes
:
page_dropped_list
.
append
(
dropped_bbox
)
dropped_bbox_list
.
append
(
page_dropped_list
)
doc
=
fitz
.
open
(
input_path
)
for
i
,
page
in
enumerate
(
doc
):
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
for
i
,
page
in
enumerate
(
pdf_docs
):
draw_bbox_with_number
(
i
,
layout_bbox_list
,
page
,
[
255
,
0
,
0
])
draw_bbox_without_number
(
i
,
dropped_bbox_list
,
page
,
[
0
,
255
,
0
])
# Save the PDF
doc
.
save
(
f
"{out_path}/layout.pdf"
)
pdf_docs
.
save
(
f
"{out_path}/layout.pdf"
)
def
draw_text_bbox
(
pdf_info_dict
,
input_path
,
out_path
):
def
draw_text_bbox
(
pdf_info_dict
,
pdf_bytes
,
out_path
):
text_list
=
[]
inline_equation_list
=
[]
interline_equation_list
=
[]
...
...
@@ -68,13 +67,12 @@ def draw_text_bbox(pdf_info_dict, input_path, out_path):
text_list
.
append
(
page_text_list
)
inline_equation_list
.
append
(
page_inline_equation_list
)
interline_equation_list
.
append
(
page_interline_equation_list
)
doc
=
fitz
.
open
(
input_path
)
for
i
,
page
in
enumerate
(
doc
):
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
for
i
,
page
in
enumerate
(
pdf_docs
):
# 获取当前页面的数据
draw_bbox_without_number
(
i
,
text_list
,
page
,
[
255
,
0
,
0
])
draw_bbox_without_number
(
i
,
inline_equation_list
,
page
,
[
0
,
255
,
0
])
draw_bbox_without_number
(
i
,
interline_equation_list
,
page
,
[
0
,
0
,
255
])
# Save the PDF
doc
.
save
(
f
"{out_path}/text.pdf"
)
pdf_docs
.
save
(
f
"{out_path}/text.pdf"
)
magic_pdf/pdf_parse_by_ocr.py
View file @
ce96c3f6
...
...
@@ -282,7 +282,7 @@ def parse_pdf_by_ocr(
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
# draw_bbox
draw_layout_bbox
(
pdf_info_dict
,
pdf_
path
,
md_bookname_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_
path
,
md_bookname_save_path
)
draw_layout_bbox
(
pdf_info_dict
,
pdf_
bytes
,
md_bookname_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_
bytes
,
md_bookname_save_path
)
return
pdf_info_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment