Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
d5e30f8d
Commit
d5e30f8d
authored
Jun 25, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: use deepcopy to keep the original model json
parent
8e537ed5
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
7 additions
and
5 deletions
+7
-5
magicpdf.py
magic_pdf/cli/magicpdf.py
+7
-5
No files found.
magic_pdf/cli/magicpdf.py
View file @
d5e30f8d
...
...
@@ -17,8 +17,8 @@
效果:
python magicpdf.py
--json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
python magicpdf.py
json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py
pdf-command
--pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
"""
import
os
...
...
@@ -45,6 +45,7 @@ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
import
csv
import
copy
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
...
...
@@ -81,6 +82,7 @@ def do_parse(
f_dump_orig_pdf
=
True
,
f_dump_content_list
=
True
,
):
orig_model_list
=
copy
.
deepcopy
(
model_list
)
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
,
parse_method
)
image_writer
,
md_writer
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
...
...
@@ -130,7 +132,7 @@ def do_parse(
if
f_dump_model_json
:
"""写model_json"""
md_writer
.
write
(
content
=
json_parse
.
dumps
(
pipe
.
model_list
,
ensure_ascii
=
False
,
indent
=
4
),
content
=
json_parse
.
dumps
(
orig_
model_list
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"{pdf_file_name}_model.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
...
...
@@ -143,7 +145,7 @@ def do_parse(
mode
=
AbsReaderWriter
.
MODE_BIN
,
)
content_list
=
pipe
.
pipe_mk_uni_format
(
str
(
image_dir
)
,
drop_mode
=
DropMode
.
NONE
)
content_list
=
pipe
.
pipe_mk_uni_format
(
image_dir
,
drop_mode
=
DropMode
.
NONE
)
if
f_dump_content_list
:
"""写content_list"""
md_writer
.
write
(
...
...
@@ -278,7 +280,7 @@ def pdf_command(pdf, model, method):
model_path
=
pdf
.
replace
(
".pdf"
,
".json"
)
if
not
os
.
path
.
exists
(
model_path
):
logger
.
warning
(
f
"not found json {model_path} existed
, use paddle analyze
"
f
"not found json {model_path} existed"
)
# 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
model_json
=
"[]"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment