Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
7f0c734f
Commit
7f0c734f
authored
Mar 28, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
pipeline重构
parent
872cd73f
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
21 additions
and
19 deletions
+21
-19
pdf2md.py
demo/pdf2md.py
+4
-2
pdf_parse_by_model.py
magic_pdf/pdf_parse_by_model.py
+2
-3
pipeline.py
magic_pdf/pipeline.py
+15
-14
No files found.
demo/pdf2md.py
View file @
7f0c734f
...
...
@@ -5,7 +5,7 @@ from pathlib import Path
import
click
from
loguru
import
logger
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
,
read_file
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
from
magic_pdf.pipeline
import
parse_pdf_by_model
...
...
@@ -21,9 +21,11 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
text_content_save_path
=
f
"{save_path}/{book_name}/book.md"
# metadata_save_path = f"{save_path}/{book_name}/metadata.json"
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
try
:
paras_dict
=
parse_pdf_by_model
(
s3_pdf_path
,
s3_pdf_profile
,
pdf_model_path
,
save_path
,
book_name
,
pdf_model_profile
,
start_page_num
,
debug_mode
=
debug_mode
pdf_bytes
,
pdf_model_path
,
save_path
,
book_name
,
pdf_model_profile
,
start_page_num
,
debug_mode
=
debug_mode
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
if
not
os
.
path
.
exists
(
parent_dir
):
...
...
magic_pdf/pdf_parse_by_model.py
View file @
7f0c734f
...
...
@@ -71,8 +71,7 @@ paraMergeException_msg = ParaMergeException().message
def
parse_pdf_by_model
(
s3_pdf_path
,
s3_pdf_profile
,
pdf_bytes
,
pdf_model_output
,
save_path
,
book_name
,
...
...
@@ -83,7 +82,7 @@ def parse_pdf_by_model(
junk_img_bojids
=
[],
debug_mode
=
False
,
):
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
md_bookname_save_path
=
""
book_name
=
sanitize_filename
(
book_name
)
...
...
magic_pdf/pipeline.py
View file @
7f0c734f
...
...
@@ -304,6 +304,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
# 开始正式逻辑
s3_pdf_path
=
jso
.
get
(
"file_location"
)
s3_config
=
get_s3_config
(
s3_pdf_path
)
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_config
)
model_output_json_list
=
jso
.
get
(
"doc_layout_result"
)
data_source
=
get_data_source
(
jso
)
file_id
=
jso
.
get
(
"file_id"
)
...
...
@@ -341,8 +342,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
file
=
sys
.
stderr
,
)
pdf_info_dict
=
parse_pdf_by_model
(
s3_pdf_path
,
s3_config
,
pdf_bytes
,
model_output_json_list
,
save_path
,
book_name
,
...
...
@@ -373,18 +373,6 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
return
jso
"""
统一处理逻辑
1.先调用parse_pdf对文本类pdf进行处理
2.再调用ocr_dropped_parse_pdf,对之前drop的pdf进行处理
"""
def
uni_parse_pdf
(
jso
:
dict
,
start_page_id
=
0
,
debug_mode
=
False
)
->
dict
:
jso
=
parse_pdf
(
jso
,
start_page_id
=
start_page_id
,
debug_mode
=
debug_mode
)
jso
=
ocr_dropped_parse_pdf
(
jso
,
start_page_id
=
start_page_id
,
debug_mode
=
debug_mode
)
return
jso
def
parse_pdf_for_model_train
(
jso
:
dict
,
start_page_id
=
0
,
debug_mode
=
False
)
->
dict
:
# 检测debug开关
if
debug_mode
:
...
...
@@ -465,6 +453,19 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
return
jso
"""
统一处理逻辑
1.先调用parse_pdf对文本类pdf进行处理
2.再调用ocr_dropped_parse_pdf,对之前drop的pdf进行处理
"""
def
uni_parse_pdf
(
jso
:
dict
,
start_page_id
=
0
,
debug_mode
=
False
)
->
dict
:
jso
=
parse_pdf
(
jso
,
start_page_id
=
start_page_id
,
debug_mode
=
debug_mode
)
jso
=
ocr_dropped_parse_pdf
(
jso
,
start_page_id
=
start_page_id
,
debug_mode
=
debug_mode
)
return
jso
# 专门用来跑被drop的pdf,跑完之后需要把need_drop字段置为false
def
ocr_dropped_parse_pdf
(
jso
:
dict
,
start_page_id
=
0
,
debug_mode
=
False
)
->
dict
:
if
not
jso
.
get
(
"need_drop"
,
False
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment