Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f65be6e0
Commit
f65be6e0
authored
Apr 08, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
pdf_parse_by_model.py ---> pdf_parse_by_txt.py
parent
0f3bfa10
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
5 additions
and
5 deletions
+5
-5
pdf2md.py
demo/pdf2md.py
+2
-2
pdf_parse_by_txt.py
magic_pdf/pdf_parse_by_txt.py
+1
-1
pipeline.py
magic_pdf/pipeline.py
+2
-2
No files found.
demo/pdf2md.py
View file @
f65be6e0
...
@@ -8,7 +8,7 @@ from loguru import logger
...
@@ -8,7 +8,7 @@ from loguru import logger
from
magic_pdf.libs.commons
import
join_path
,
read_file
from
magic_pdf.libs.commons
import
join_path
,
read_file
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
,
mk_universal_format
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
,
mk_universal_format
from
magic_pdf.p
ipeline
import
parse_pdf_by_model
from
magic_pdf.p
df_parse_by_txt
import
parse_pdf_by_txt
...
@@ -25,7 +25,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
...
@@ -25,7 +25,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
try
:
try
:
paras_dict
=
parse_pdf_by_
model
(
paras_dict
=
parse_pdf_by_
txt
(
pdf_bytes
,
pdf_model_path
,
save_path
,
book_name
,
pdf_model_profile
,
start_page_num
,
debug_mode
=
debug_mode
pdf_bytes
,
pdf_model_path
,
save_path
,
book_name
,
pdf_model_profile
,
start_page_num
,
debug_mode
=
debug_mode
)
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
...
...
magic_pdf/pdf_parse_by_
model
.py
→
magic_pdf/pdf_parse_by_
txt
.py
View file @
f65be6e0
...
@@ -70,7 +70,7 @@ paraMergeException_msg = ParaMergeException().message
...
@@ -70,7 +70,7 @@ paraMergeException_msg = ParaMergeException().message
def
parse_pdf_by_
model
(
def
parse_pdf_by_
txt
(
pdf_bytes
,
pdf_bytes
,
pdf_model_output
,
pdf_model_output
,
save_path
,
save_path
,
...
...
magic_pdf/pipeline.py
View file @
f65be6e0
...
@@ -13,7 +13,7 @@ from magic_pdf.libs.commons import (
...
@@ -13,7 +13,7 @@ from magic_pdf.libs.commons import (
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
from
magic_pdf.pdf_parse_by_
model
import
parse_pdf_by_model
from
magic_pdf.pdf_parse_by_
txt
import
parse_pdf_by_txt
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
loguru
import
logger
from
loguru
import
logger
...
@@ -310,7 +310,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
...
@@ -310,7 +310,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
f
"book_name is:{book_name},start_time is:{formatted_time(start_time)}"
,
f
"book_name is:{book_name},start_time is:{formatted_time(start_time)}"
,
file
=
sys
.
stderr
,
file
=
sys
.
stderr
,
)
)
pdf_info_dict
=
parse_pdf_by_
model
(
pdf_info_dict
=
parse_pdf_by_
txt
(
pdf_bytes
,
pdf_bytes
,
model_output_json_list
,
model_output_json_list
,
save_path
,
save_path
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment