Commit dbe79ba1 authored by 赵小蒙

ocr_mk_mm_markdown_with_para_and_pagination逻辑更新

parent f36c2656
......@@ -99,28 +99,29 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
markdown_with_para_and_pagination = []
for page_no, page_info in pdf_info_dict.items():
page_markdown = []
paras = page_info.get("para_blocks")
if not paras:
paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout:
continue
for para in paras:
para_text = ''
for line in para:
for span in line['spans']:
span_type = span.get('type')
if span_type == ContentType.Text:
content = split_long_words(span['content'])
# content = span['content']
elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$"
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]:
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
para_text += content + ' '
page_markdown.append(para_text.strip() + ' ')
for paras in paras_of_layout:
for para in paras:
para_text = ''
for line in para:
for span in line['spans']:
span_type = span.get('type')
if span_type == ContentType.Text:
content = split_long_words(span['content'])
# content = span['content']
elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$"
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]:
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
para_text += content + ' '
page_markdown.append(para_text.strip() + ' ')
markdown_with_para_and_pagination.append({
'page_no': page_no,
'md': '\n\n'.join(page_markdown)
'md_content': '\n\n'.join(page_markdown)
})
return markdown_with_para_and_pagination
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment