Commit 4b8dbd7c authored by 赵小蒙

ocr_pdf_intermediate_dict_to_markdown_with_para支持mm和nlp双模式

parent d6a5724b
...@@ -41,7 +41,7 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict: ...@@ -41,7 +41,7 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
return jso return jso
def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) -> dict: def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, mode, debug_mode=False) -> dict:
if debug_mode: if debug_mode:
pass pass
else: # 如果debug没开,则检测是否有needdrop字段 else: # 如果debug没开,则检测是否有needdrop字段
...@@ -54,8 +54,12 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) ...@@ -54,8 +54,12 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
pdf_intermediate_dict = jso["pdf_intermediate_dict"] pdf_intermediate_dict = jso["pdf_intermediate_dict"]
# 将 pdf_intermediate_dict 解压 # 将 pdf_intermediate_dict 解压
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict) pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
# markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
if mode == "mm":
markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
elif mode == "nlp":
markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict) markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
jso["content"] = markdown_content jso["content"] = markdown_content
logger.info( logger.info(
f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment