Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
88f2245d
Commit
88f2245d
authored
Jun 04, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update cli
parent
bc055266
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
57 additions
and
52 deletions
+57
-52
magicpdf.py
magic_pdf/cli/magicpdf.py
+57
-52
No files found.
magic_pdf/cli/magicpdf.py
View file @
88f2245d
...
...
@@ -27,6 +27,7 @@ import sys
import
click
from
loguru
import
logger
from
pathlib
import
Path
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
...
...
@@ -52,7 +53,7 @@ def prepare_env(pdf_file_name, method):
get_local_dir
(),
"magic-pdf"
,
pdf_file_name
,
method
)
local_image_dir
=
os
.
path
.
join
(
local_parent_dir
,
"images"
)
local_image_dir
=
os
.
path
.
join
(
str
(
local_parent_dir
)
,
"images"
)
local_md_dir
=
local_parent_dir
os
.
makedirs
(
local_image_dir
,
exist_ok
=
True
)
os
.
makedirs
(
local_md_dir
,
exist_ok
=
True
)
...
...
@@ -102,6 +103,7 @@ def cli():
@
cli
.
command
()
@
click
.
version_option
(
__version__
,
"--version"
,
"-v"
,
help
=
"显示版本信息"
)
@
click
.
option
(
"--json"
,
type
=
str
,
help
=
"输入一个S3路径"
)
@
click
.
option
(
"--method"
,
...
...
@@ -158,63 +160,65 @@ def json_command(json, method):
)
@
cli
.
command
()
@
click
.
option
(
"--local_json"
,
type
=
str
,
help
=
"输入一个本地jsonl路径"
)
@
click
.
option
(
"--method"
,
type
=
parse_pdf_methods
,
help
=
"指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法"
,
default
=
"auto"
,
)
def
local_json_command
(
local_json
,
method
):
def
read_s3_path
(
s3path
):
bucket
,
key
=
parse_s3path
(
s3path
)
@
cli
.
command
()
@
click
.
version_option
(
__version__
,
"--version"
,
"-v"
,
help
=
"显示版本信息"
)
@
click
.
option
(
"--local_json"
,
type
=
str
,
help
=
"输入一个本地jsonl路径"
)
@
click
.
option
(
"--method"
,
type
=
parse_pdf_methods
,
help
=
"指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法"
,
default
=
"auto"
,
)
def
local_json_command
(
local_json
,
method
):
def
read_s3_path
(
s3path
):
bucket
,
key
=
parse_s3path
(
s3path
)
s3_ak
,
s3_sk
,
s3_endpoint
=
get_s3_config
(
bucket
)
s3_rw
=
S3ReaderWriter
(
s3_ak
,
s3_sk
,
s3_endpoint
,
"auto"
,
remove_non_official_s3_args
(
s3path
)
)
may_range_params
=
parse_s3_range_params
(
s3path
)
if
may_range_params
is
None
or
2
!=
len
(
may_range_params
):
byte_start
,
byte_end
=
0
,
None
else
:
byte_start
,
byte_end
=
int
(
may_range_params
[
0
]),
int
(
may_range_params
[
1
])
byte_end
+=
byte_start
-
1
return
s3_rw
.
read_jsonl
(
remove_non_official_s3_args
(
s3path
),
byte_start
,
byte_end
,
AbsReaderWriter
.
MODE_BIN
,
s3_ak
,
s3_sk
,
s3_endpoint
=
get_s3_config
(
bucket
)
s3_rw
=
S3ReaderWriter
(
s3_ak
,
s3_sk
,
s3_endpoint
,
"auto"
,
remove_non_official_s3_args
(
s3path
)
)
may_range_params
=
parse_s3_range_params
(
s3path
)
if
may_range_params
is
None
or
2
!=
len
(
may_range_params
):
byte_start
,
byte_end
=
0
,
None
else
:
byte_start
,
byte_end
=
int
(
may_range_params
[
0
]),
int
(
may_range_params
[
1
])
byte_end
+=
byte_start
-
1
return
s3_rw
.
read_jsonl
(
remove_non_official_s3_args
(
s3path
),
byte_start
,
byte_end
,
AbsReaderWriter
.
MODE_BIN
,
)
with
open
(
local_json
,
"r"
,
encoding
=
"utf-8"
)
as
f
:
for
json_line
in
f
:
jso
=
json_parse
.
loads
(
json_line
)
s3_file_path
=
jso
.
get
(
"file_location"
)
if
s3_file_path
is
None
:
s3_file_path
=
jso
.
get
(
"path"
)
pdf_file_name
=
Path
(
s3_file_path
)
.
stem
pdf_data
=
read_s3_path
(
s3_file_path
)
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
,
method
)
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
with
open
(
local_json
,
"r"
,
encoding
=
"utf-8"
)
as
f
:
for
json_line
in
f
:
jso
=
json_parse
.
loads
(
json_line
)
s3_file_path
=
jso
.
get
(
"file_location"
)
if
s3_file_path
is
None
:
s3_file_path
=
jso
.
get
(
"path"
)
pdf_file_name
=
Path
(
s3_file_path
)
.
stem
pdf_data
=
read_s3_path
(
s3_file_path
)
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
,
method
)
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
_do_parse
(
pdf_file_name
,
pdf_data
,
jso
[
"doc_layout_result"
],
method
,
local_image_rw
,
local_md_rw
,
os
.
path
.
basename
(
local_image_dir
),
local_md_dir
)
_do_parse
(
pdf_file_name
,
pdf_data
,
jso
[
"doc_layout_result"
],
method
,
local_image_rw
,
local_md_rw
,
os
.
path
.
basename
(
local_image_dir
),
local_md_dir
)
@
cli
.
command
()
@
click
.
version_option
(
__version__
,
"--version"
,
"-v"
,
help
=
"显示版本信息"
)
@
click
.
option
(
"--pdf"
,
type
=
click
.
Path
(
exists
=
True
),
required
=
True
,
help
=
"PDF文件的路径"
)
...
...
@@ -256,6 +260,7 @@ def pdf_command(pdf, model, method):
)
if
__name__
==
"__main__"
:
"""
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment