Commit f10b4a50 authored by 赵小蒙

s3_image_save_path统一配置

parent b1ac8d03
from magic_pdf.libs.commons import s3_image_save_path, join_path
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType
......@@ -42,7 +43,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
if not span.get('image_path'):
continue
else:
content = f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']})"
content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
else:
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
if span['type'] == ContentType.InlineEquation:
......@@ -73,7 +74,7 @@ def mk_mm_markdown2(pdf_info_dict:dict):
elif span_type == ContentType.InterlineEquation:
para_text += f"$$\n{span['content']}\n$$ "
elif span_type == ContentType.Image:
para_text += f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']}) "
para_text += f"![]({join_path(s3_image_save_path, span['image_path'])})"
markdown.append(para_text)
return '\n\n'.join(markdown)
......
......@@ -24,6 +24,8 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
json_dump_path = "s3://llm-pdf-text/json_dump/"
s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"
def get_top_percent_list(num_list, percent):
"""
......
......@@ -4,7 +4,7 @@ import time
from urllib.parse import quote
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time
from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time, s3_image_save_path
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.dict2md.mkcontent import mk_nlp_markdown
......@@ -287,7 +287,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
# jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
else:
try:
save_path = "s3://mllm-raw-media/pdf2md_img/"
save_path = s3_image_save_path
image_s3_config = get_s3_config(save_path)
start_time = time.time() # 记录开始时间
# 先打印一下book_name和解析开始的时间
......@@ -328,7 +328,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
file_id = jso.get('file_id')
book_name = f"{data_source}/{file_id}"
try:
save_path = "s3://mllm-raw-media/pdf2md_img/"
save_path = s3_image_save_path
image_s3_config = get_s3_config(save_path)
start_time = time.time() # 记录开始时间
# 先打印一下book_name和解析开始的时间
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment