Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
ef0129ad
Commit
ef0129ad
authored
Apr 23, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改pdf的路径
parent
ed40e1d5
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
16 additions
and
11 deletions
+16
-11
magicpdf.py
magic_pdf/cli/magicpdf.py
+16
-11
No files found.
magic_pdf/cli/magicpdf.py
View file @
ef0129ad
...
...
@@ -23,9 +23,9 @@ python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloa
import
os
import
json
as
json_parse
from
datetime
import
datetime
import
click
from
loguru
import
logger
from
pathlib
import
Path
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
...
...
@@ -44,9 +44,9 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
def
prepare_env
():
def
prepare_env
(
pdf_file_name
):
local_parent_dir
=
os
.
path
.
join
(
get_local_dir
(),
"magic-pdf"
,
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d"
)
get_local_dir
(),
"magic-pdf"
,
pdf_file_name
)
local_image_dir
=
os
.
path
.
join
(
local_parent_dir
,
"images"
)
...
...
@@ -56,7 +56,7 @@ def prepare_env():
return
local_image_dir
,
local_md_dir
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
def
_do_parse
(
pdf_
file_name
,
pdf_
bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
if
parse_method
==
"auto"
:
pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
elif
parse_method
==
"txt"
:
...
...
@@ -70,13 +70,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
pipe
.
pipe_classify
()
pipe
.
pipe_parse
()
md_content
=
pipe
.
pipe_mk_markdown
()
part_file_name
=
datetime
.
now
()
.
strftime
(
"
%
H-
%
M-
%
S"
)
#
part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer
.
write
(
content
=
md_content
,
path
=
f
"{p
art
_file_name}.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
content
=
md_content
,
path
=
f
"{p
df
_file_name}.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
md_writer
.
write
(
content
=
json_parse
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"{p
art
_file_name}.json"
,
path
=
f
"{p
df
_file_name}.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
# try:
...
...
@@ -127,14 +127,17 @@ def json_command(json, method):
)
jso
=
json_parse
.
loads
(
read_s3_path
(
json
)
.
decode
(
"utf-8"
))
pdf_data
=
read_s3_path
(
jso
[
"file_location"
])
local_image_dir
,
local_md_dir
=
prepare_env
()
s3_file_path
=
jso
[
"file_location"
]
pdf_file_name
=
Path
(
s3_file_path
)
.
stem
pdf_data
=
read_s3_path
(
s3_file_path
)
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
)
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
_do_parse
(
pdf_file_name
,
pdf_data
,
jso
[
"doc_layout_result"
],
method
,
...
...
@@ -169,11 +172,13 @@ def pdf_command(pdf, model, method):
pdf_data
=
read_fn
(
pdf
)
jso
=
json_parse
.
loads
(
read_fn
(
model
)
.
decode
(
"utf-8"
))
local_image_dir
,
local_md_dir
=
prepare_env
()
pdf_file_name
=
Path
(
pdf
)
.
stem
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
)
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
_do_parse
(
pdf_file_name
,
pdf_data
,
jso
,
method
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment