Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
4c37e741
Commit
4c37e741
authored
Apr 11, 2024
by
许瑞
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat: support multiple pdf parse method
parent
55cba1f4
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
30 additions
and
5 deletions
+30
-5
magicpdf.py
magic_pdf/cli/magicpdf.py
+30
-5
No files found.
magic_pdf/cli/magicpdf.py
View file @
4c37e741
...
...
@@ -31,12 +31,23 @@ from magic_pdf.libs.path_utils import (
from
magic_pdf.libs.config_reader
import
get_local_dir
from
magic_pdf.io.S3ReaderWriter
import
S3ReaderWriter
,
MODE_BIN
from
magic_pdf.io.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.spark.spark_api
import
parse_union_pdf
from
magic_pdf.spark.spark_api
import
parse_union_pdf
,
parse_txt_pdf
,
parse_ocr_pdf
import
os
import
json
as
json_parse
from
datetime
import
datetime
# Accepted values for the ``--method`` CLI option; passed as ``type=`` to the
# click.option declarations on the commands in this file.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def get_pdf_parse_method(method):
    """Return the parser callable that corresponds to ``method``.

    ``"ocr"`` maps to ``parse_ocr_pdf`` and ``"txt"`` maps to
    ``parse_txt_pdf``. Any other value (including ``"auto"``) falls back
    to ``parse_union_pdf``.
    """
    selected = (
        parse_ocr_pdf
        if method == "ocr"
        else parse_txt_pdf
        if method == "txt"
        else parse_union_pdf  # default for "auto" and anything unrecognised
    )
    return selected
def
prepare_env
():
local_parent_dir
=
os
.
path
.
join
(
get_local_dir
(),
"magic-pdf"
,
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d-
%
H-
%
M-
%
S"
)
...
...
@@ -56,7 +67,13 @@ def cli():
@
cli
.
command
()
@
click
.
option
(
"--json"
,
type
=
str
,
help
=
"输入一个S3路径"
)
def
json_command
(
json
):
@
click
.
option
(
"--method"
,
type
=
parse_pdf_methods
,
help
=
"指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法"
,
default
=
"auto"
,
)
def
json_command
(
json
,
method
):
if
not
json
.
startswith
(
"s3://"
):
print
(
"usage: python magipdf.py --json s3://some_bucket/some_path"
)
os
.
exit
(
1
)
...
...
@@ -82,7 +99,8 @@ def json_command(json):
local_image_dir
,
_
=
prepare_env
()
local_image_rw
=
DiskReaderWriter
(
local_image_dir
)
parse_union_pdf
(
pdf_data
,
jso
[
"doc_layout_result"
],
local_image_rw
,
is_debug
=
True
)
parse
=
get_pdf_parse_method
(
method
)
parse
(
pdf_data
,
jso
[
"doc_layout_result"
],
local_image_rw
,
is_debug
=
True
)
@
cli
.
command
()
...
...
@@ -90,7 +108,13 @@ def json_command(json):
"--pdf"
,
type
=
click
.
Path
(
exists
=
True
),
required
=
True
,
help
=
"PDF文件的路径"
)
@
click
.
option
(
"--model"
,
type
=
click
.
Path
(
exists
=
True
),
help
=
"模型的路径"
)
def
pdf_command
(
pdf
,
model
):
@
click
.
option
(
"--method"
,
type
=
parse_pdf_methods
,
help
=
"指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法"
,
default
=
"auto"
,
)
def
pdf_command
(
pdf
,
model
,
method
):
# 这里处理pdf和模型相关的逻辑
if
model
is
None
:
model
=
pdf
.
replace
(
".pdf"
,
".json"
)
...
...
@@ -107,7 +131,8 @@ def pdf_command(pdf, model):
local_image_dir
,
_
=
prepare_env
()
local_image_rw
=
DiskReaderWriter
(
local_image_dir
)
parse_union_pdf
(
pdf_data
,
jso
[
"doc_layout_result"
],
local_image_rw
,
is_debug
=
True
)
parse
=
get_pdf_parse_method
(
method
)
parse
(
pdf_data
,
jso
[
"doc_layout_result"
],
local_image_rw
,
is_debug
=
True
)
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment