Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f10b4a50
Commit
f10b4a50
authored
Mar 15, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
s3_image_save_path统一配置
parent
b1ac8d03
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
8 additions
and
5 deletions
+8
-5
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+3
-2
commons.py
magic_pdf/libs/commons.py
+2
-0
pipeline.py
magic_pdf/pipeline.py
+3
-3
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
f10b4a50
from
magic_pdf.libs.commons
import
s3_image_save_path
,
join_path
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.ocr_content_type
import
ContentType
...
...
@@ -42,7 +43,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
if
not
span
.
get
(
'image_path'
):
continue
else
:
content = f"![]({span['image_path']})"
content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
else
:
content
=
ocr_escape_special_markdown_char
(
span
[
'content'
])
# 转义特殊符号
if
span
[
'type'
]
==
ContentType
.
InlineEquation
:
...
...
@@ -73,7 +74,7 @@ def mk_mm_markdown2(pdf_info_dict:dict):
elif
span_type
==
ContentType
.
InterlineEquation
:
para_text
+=
f
"$$
\n
{span['content']}
\n
$$ "
elif
span_type
==
ContentType
.
Image
:
para_text += f"![]({span['image_path']})"
para_text += f"![]({join_path(s3_image_save_path, span['image_path'])})"
markdown
.
append
(
para_text
)
return
'
\n\n
'
.
join
(
markdown
)
...
...
magic_pdf/libs/commons.py
View file @
f10b4a50
...
...
@@ -24,6 +24,8 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
json_dump_path
=
"s3://llm-pdf-text/json_dump/"
s3_image_save_path
=
"s3://mllm-raw-media/pdf2md_img/"
def
get_top_percent_list
(
num_list
,
percent
):
"""
...
...
magic_pdf/pipeline.py
View file @
f10b4a50
...
...
@@ -4,7 +4,7 @@ import time
from
urllib.parse
import
quote
from
magic_pdf.dict2md.ocr_mkcontent
import
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
parse_bucket_key
,
formatted_time
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
parse_bucket_key
,
formatted_time
,
s3_image_save_path
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.dict2md.mkcontent
import
mk_nlp_markdown
...
...
@@ -287,7 +287,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
# jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
else
:
try
:
save_path
=
"s3://mllm-raw-media/pdf2md_img/"
save_path
=
s3_image_save_path
image_s3_config
=
get_s3_config
(
save_path
)
start_time
=
time
.
time
()
# 记录开始时间
# 先打印一下book_name和解析开始的时间
...
...
@@ -328,7 +328,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
file_id
=
jso
.
get
(
'file_id'
)
book_name
=
f
"{data_source}/{file_id}"
try
:
save_path
=
"s3://mllm-raw-media/pdf2md_img/"
save_path
=
s3_image_save_path
image_s3_config
=
get_s3_config
(
save_path
)
start_time
=
time
.
time
()
# 记录开始时间
# 先打印一下book_name和解析开始的时间
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment