Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f5bde135
Commit
f5bde135
authored
Jul 25, 2024
by
myhloli
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/master'
parents
277e8e1c
c753b41e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
136 additions
and
0 deletions
+136
-0
magic_pdf_parse_main.py
demo/magic_pdf_parse_main.py
+136
-0
No files found.
demo/magic_pdf_parse_main.py
0 → 100644
View file @
f5bde135
import
os
import
json
import
copy
from
loguru
import
logger
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
import
magic_pdf.model
as
model_config
model_config
.
__use_inside_model__
=
True
# todo: 设备类型选择 (?)
def
json_md_dump
(
pipe
,
md_writer
,
pdf_name
,
content_list
,
md_content
,
):
# 写入模型结果到 model.json
orig_model_list
=
copy
.
deepcopy
(
pipe
.
model_list
)
md_writer
.
write
(
content
=
json
.
dumps
(
orig_model_list
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"{pdf_name}_model.json"
)
# 写入中间结果到 middle.json
md_writer
.
write
(
content
=
json
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"{pdf_name}_middle.json"
)
# text文本结果写入到 conent_list.json
md_writer
.
write
(
content
=
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"{pdf_name}_content_list.json"
)
# 写入结果到 .md 文件中
md_writer
.
write
(
content
=
md_content
,
path
=
f
"{pdf_name}.md"
)
def
pdf_parse_main
(
pdf_path
:
str
,
parse_method
:
str
=
'auto'
,
model_json_path
:
str
=
None
,
is_json_md_dump
:
bool
=
True
,
output_dir
:
str
=
None
):
"""
执行从 pdf 转换到 json、md 的过程,输出 md 和 json 文件到 pdf 文件所在的目录
:param pdf_path: .pdf 文件的路径,可以是相对路径,也可以是绝对路径
:param parse_method: 解析方法, 共 auto、ocr、txt 三种,默认 auto,如果效果不好,可以尝试 ocr
:param model_json_path: 已经存在的模型数据文件,如果为空则使用内置模型,pdf 和 model_json 务必对应
:param is_json_md_dump: 是否将解析后的数据写入到 .json 和 .md 文件中,默认 True,会将不同阶段的数据写入到不同的 .json 文件中(共3个.json文件),md内容会保存到 .md 文件中
:param output_dir: 输出结果的目录地址,会生成一个以 pdf 文件名命名的文件夹并保存所有结果
"""
try
:
pdf_name
=
os
.
path
.
basename
(
pdf_path
)
.
split
(
"."
)[
0
]
pdf_path_parent
=
os
.
path
.
dirname
(
pdf_path
)
if
output_dir
:
output_path
=
os
.
path
.
join
(
output_dir
,
pdf_name
)
else
:
output_path
=
os
.
path
.
join
(
pdf_path_parent
,
pdf_name
)
output_image_path
=
os
.
path
.
join
(
output_path
,
'images'
)
# 获取图片的父路径,为的是以相对路径保存到 .md 和 conent_list.json 文件中
image_path_parent
=
os
.
path
.
basename
(
output_image_path
)
pdf_bytes
=
open
(
pdf_path
,
"rb"
)
.
read
()
# 读取 pdf 文件的二进制数据
if
model_json_path
:
# 读取已经被模型解析后的pdf文件的 json 原始数据,list 类型
model_json
=
json
.
loads
(
open
(
model_json_path
,
"r"
,
encoding
=
"utf-8"
)
.
read
())
else
:
model_json
=
[]
# 执行解析步骤
# image_writer = DiskReaderWriter(output_image_path)
image_writer
,
md_writer
=
DiskReaderWriter
(
output_image_path
),
DiskReaderWriter
(
output_path
)
# 选择解析方式
# jso_useful_key = {"_pdf_type": "", "model_list": model_json}
# pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
if
parse_method
==
"auto"
:
jso_useful_key
=
{
"_pdf_type"
:
""
,
"model_list"
:
model_json
}
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
)
elif
parse_method
==
"txt"
:
pipe
=
TXTPipe
(
pdf_bytes
,
model_json
,
image_writer
)
elif
parse_method
==
"ocr"
:
pipe
=
OCRPipe
(
pdf_bytes
,
model_json
,
image_writer
)
else
:
logger
.
error
(
"unknown parse method, only auto, ocr, txt allowed"
)
exit
(
1
)
# 执行分类
pipe
.
pipe_classify
()
# 如果没有传入模型数据,则使用内置模型解析
if
not
model_json
:
if
model_config
.
__use_inside_model__
:
pipe
.
pipe_analyze
()
# 解析
else
:
logger
.
error
(
"need model list input"
)
exit
(
1
)
# 执行解析
pipe
.
pipe_parse
()
# 保存 text 和 md 格式的结果
content_list
=
pipe
.
pipe_mk_uni_format
(
image_path_parent
,
drop_mode
=
"none"
)
md_content
=
pipe
.
pipe_mk_markdown
(
image_path_parent
,
drop_mode
=
"none"
)
if
is_json_md_dump
:
json_md_dump
(
pipe
,
md_writer
,
pdf_name
,
content_list
,
md_content
)
except
Exception
as
e
:
logger
.
exception
(
e
)
# 测试
if
__name__
==
'__main__'
:
pdf_path
=
r"C:\Users\XYTK2\Desktop\2024-2016-gb-cd-300.pdf"
pdf_parse_main
(
pdf_path
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment