Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f52c6249
Commit
f52c6249
authored
Apr 08, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
更新路径输入和markdown输出逻辑
parent
ca7059e5
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
6 deletions
+10
-6
pdf2md.py
demo/pdf2md.py
+10
-6
No files found.
demo/pdf2md.py
View file @
f52c6249
import
json
import
os
import
os
import
sys
import
sys
from
pathlib
import
Path
from
pathlib
import
Path
...
@@ -6,7 +7,7 @@ import click
...
@@ -6,7 +7,7 @@ import click
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.commons
import
join_path
,
read_file
from
magic_pdf.libs.commons
import
join_path
,
read_file
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
,
mk_universal_format
from
magic_pdf.pipeline
import
parse_pdf_by_model
from
magic_pdf.pipeline
import
parse_pdf_by_model
...
@@ -32,7 +33,8 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
...
@@ -32,7 +33,8 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
os
.
makedirs
(
parent_dir
)
os
.
makedirs
(
parent_dir
)
if
not
paras_dict
.
get
(
'need_drop'
):
if
not
paras_dict
.
get
(
'need_drop'
):
markdown_content
=
mk_mm_markdown
(
paras_dict
)
content_list
=
mk_universal_format
(
paras_dict
)
markdown_content
=
mk_mm_markdown
(
content_list
)
else
:
else
:
markdown_content
=
paras_dict
[
'drop_reason'
]
markdown_content
=
paras_dict
[
'drop_reason'
]
...
@@ -70,8 +72,8 @@ def main_shell(pdf_file_path: str, save_path: str):
...
@@ -70,8 +72,8 @@ def main_shell(pdf_file_path: str, save_path: str):
@
click
.
command
()
@
click
.
command
()
@
click
.
option
(
"--pdf-dir"
,
help
=
"
s3上
pdf文件的路径"
)
@
click
.
option
(
"--pdf-dir"
,
help
=
"
本地
pdf文件的路径"
)
@
click
.
option
(
"--model-dir"
,
help
=
"
s3上pdf
文件的路径"
)
@
click
.
option
(
"--model-dir"
,
help
=
"
本地模型
文件的路径"
)
@
click
.
option
(
"--start-page-num"
,
default
=
0
,
help
=
"从第几页开始解析"
)
@
click
.
option
(
"--start-page-num"
,
default
=
0
,
help
=
"从第几页开始解析"
)
def
main_shell2
(
pdf_dir
:
str
,
model_dir
:
str
,
start_page_num
:
int
):
def
main_shell2
(
pdf_dir
:
str
,
model_dir
:
str
,
start_page_num
:
int
):
# 先扫描所有的pdf目录里的文件名字
# 先扫描所有的pdf目录里的文件名字
...
@@ -86,8 +88,10 @@ def main_shell2(pdf_dir: str, model_dir: str,start_page_num: int):
...
@@ -86,8 +88,10 @@ def main_shell2(pdf_dir: str, model_dir: str,start_page_num: int):
for
pdf_file
in
pdf_file_names
:
for
pdf_file
in
pdf_file_names
:
pdf_file_path
=
os
.
path
.
join
(
pdf_dir
,
pdf_file
)
pdf_file_path
=
os
.
path
.
join
(
pdf_dir
,
pdf_file
)
model_file_path
=
os
.
path
.
join
(
model_dir
,
pdf_file
)
model_file_path
=
os
.
path
.
join
(
model_dir
,
pdf_file
)
.
rstrip
(
".pdf"
)
+
".json"
main
(
pdf_file_path
,
None
,
model_file_path
,
None
,
start_page_num
)
with
open
(
model_file_path
,
"r"
)
as
json_file
:
model_list
=
json
.
load
(
json_file
)
main
(
pdf_file_path
,
None
,
model_list
,
None
,
start_page_num
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment