Commit dbe79ba1 authored by 赵小蒙

ocr_mk_mm_markdown_with_para_and_pagination逻辑更新

parent f36c2656
...@@ -99,28 +99,29 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict): ...@@ -99,28 +99,29 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
markdown_with_para_and_pagination = [] markdown_with_para_and_pagination = []
for page_no, page_info in pdf_info_dict.items(): for page_no, page_info in pdf_info_dict.items():
page_markdown = [] page_markdown = []
paras = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
if not paras: if not paras_of_layout:
continue continue
for para in paras: for paras in paras_of_layout:
para_text = '' for para in paras:
for line in para: para_text = ''
for span in line['spans']: for line in para:
span_type = span.get('type') for span in line['spans']:
if span_type == ContentType.Text: span_type = span.get('type')
content = split_long_words(span['content']) if span_type == ContentType.Text:
# content = span['content'] content = split_long_words(span['content'])
elif span_type == ContentType.InlineEquation: # content = span['content']
content = f"${span['content']}$" elif span_type == ContentType.InlineEquation:
elif span_type == ContentType.InterlineEquation: content = f"${span['content']}$"
content = f"\n$$\n{span['content']}\n$$\n" elif span_type == ContentType.InterlineEquation:
elif span_type in [ContentType.Image, ContentType.Table]: content = f"\n$$\n{span['content']}\n$$\n"
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n" elif span_type in [ContentType.Image, ContentType.Table]:
para_text += content + ' ' content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
page_markdown.append(para_text.strip() + ' ') para_text += content + ' '
page_markdown.append(para_text.strip() + ' ')
markdown_with_para_and_pagination.append({ markdown_with_para_and_pagination.append({
'page_no': page_no, 'page_no': page_no,
'md': '\n\n'.join(page_markdown) 'md_content': '\n\n'.join(page_markdown)
}) })
return markdown_with_para_and_pagination return markdown_with_para_and_pagination
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment