Commit bf8d8e21 authored by 赵小蒙

新增ocr_mk_nlp_markdown_with_para方法

parent 744b3f75
...@@ -72,18 +72,26 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict): ...@@ -72,18 +72,26 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for _, page_info in pdf_info_dict.items():
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout) page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm")
markdown.extend(page_markdown) markdown.extend(page_markdown)
return '\n\n'.join(markdown) return '\n\n'.join(markdown)
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict):
    """Render every page of a document to markdown in "nlp" mode.

    Each page's ``para_blocks`` are handed to
    ``ocr_mk_mm_markdown_with_para_core`` with ``mode="nlp"``; in that mode
    the core renderer does not build an image link for image/table spans.

    Args:
        pdf_info_dict: Mapping whose values are per-page info dicts. Only the
            ``"para_blocks"`` entry of each page is read; the keys are
            ignored.

    Returns:
        The markdown fragments of all pages joined with blank lines
        (``"\\n\\n"``). An empty string when no page has paragraph blocks.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # A page may have no "para_blocks" entry (``get`` returns None) or an
        # empty one. The core renderer iterates its argument, so None would
        # raise TypeError; skip such pages, exactly as
        # ocr_mk_mm_markdown_with_para_and_pagination does.
        if not paras_of_layout:
            continue
        markdown.extend(
            ocr_mk_mm_markdown_with_para_core(paras_of_layout, "nlp")
        )
    return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict): def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
markdown_with_para_and_pagination = [] markdown_with_para_and_pagination = []
for page_no, page_info in pdf_info_dict.items(): for page_no, page_info in pdf_info_dict.items():
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout: if not paras_of_layout:
continue continue
page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout) page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm")
markdown_with_para_and_pagination.append({ markdown_with_para_and_pagination.append({
'page_no': page_no, 'page_no': page_no,
'md_content': '\n\n'.join(page_markdown) 'md_content': '\n\n'.join(page_markdown)
...@@ -91,7 +99,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict): ...@@ -91,7 +99,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
return markdown_with_para_and_pagination return markdown_with_para_and_pagination
def ocr_mk_mm_markdown_with_para_core(paras_of_layout): def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
page_markdown = [] page_markdown = []
for paras in paras_of_layout: for paras in paras_of_layout:
for para in paras: for para in paras:
...@@ -107,7 +115,11 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout): ...@@ -107,7 +115,11 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]: elif span_type in [ContentType.Image, ContentType.Table]:
if mode == 'mm':
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n" content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
elif mode == 'nlp':
pass
if content:
para_text += content + ' ' para_text += content + ' '
page_markdown.append(para_text.strip() + ' ') page_markdown.append(para_text.strip() + ' ')
return page_markdown return page_markdown
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment