Commit 6199e608 authored by 赵小蒙's avatar 赵小蒙

add union_make logic

parent 87ac340a
from loguru import logger from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
...@@ -319,3 +320,37 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list): ...@@ -319,3 +320,37 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
content = line_to_standard_format(line) content = line_to_standard_format(line)
content_list.append(content) content_list.append(content)
return content_list return content_list
def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""):
output_content = []
for page_info in pdf_info_dict:
if page_info.get("need_drop", False):
drop_reason = page_info.get("drop_reason")
if drop_mode == DropMode.NONE:
pass
elif drop_mode == DropMode.WHOLE_PDF:
raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}")
elif drop_mode == DropMode.SINGLE_PAGE:
logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}")
continue
else:
raise Exception(f"drop_mode can not be null")
paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout:
continue
if make_mode == MakeMode.MM_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
output_content.extend(page_markdown)
elif make_mode == MakeMode.NLP_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout:
para_content = para_to_standard_format_v2(para_block, img_buket_path)
output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content)
elif make_mode == MakeMode.STANDARD_FORMAT:
return output_content
class MakeMode:
MM_MD = "mm_markdown"
NLP_MD = "nlp_markdown"
STANDARD_FORMAT = "standard_format"
class DropMode:
WHOLE_PDF = "whole_pdf"
SINGLE_PAGE = "single_page"
NONE = "none"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment