Commit 7631907f authored by 赵小蒙

fix interline_equations block

parent 351a3ce1
...@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): ...@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
page_markdown = [] page_markdown = []
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_text = '' para_text = ''
para_type = para_block.get('type') para_type = para_block['type']
if para_type == BlockType.Text: if para_type == BlockType.Text:
para_text = merge_para_with_text(para_block) para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Title: elif para_type == BlockType.Title:
...@@ -106,32 +106,30 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): ...@@ -106,32 +106,30 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
if mode == 'nlp': if mode == 'nlp':
continue continue
elif mode == 'mm': elif mode == 'mm':
img_blocks = para_block.get('blocks') for block in para_block['blocks']:
for img_block in img_blocks: if block['type'] == BlockType.ImageBody:
if img_block.get('type') == BlockType.ImageBody: for line in block['lines']:
for line in img_block.get('lines'):
for span in line['spans']: for span in line['spans']:
if span.get('type') == ContentType.Image: if span['type'] == ContentType.Image:
para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n" para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
for img_block in img_blocks: for block in para_block['blocks']:
if img_block.get('type') == BlockType.ImageCaption: if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(img_block) para_text += merge_para_with_text(block)
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
if mode == 'nlp': if mode == 'nlp':
continue continue
elif mode == 'mm': elif mode == 'mm':
table_blocks = para_block.get('blocks') for block in para_block['blocks']:
for table_block in table_blocks: if block['type'] == BlockType.TableBody:
if table_block.get('type') == BlockType.TableBody: for line in block['lines']:
for line in table_block.get('lines'):
for span in line['spans']: for span in line['spans']:
if span.get('type') == ContentType.Table: if span['type'] == ContentType.Table:
para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n" para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
for table_block in table_blocks: for block in para_block['blocks']:
if table_block.get('type') == BlockType.TableCaption: if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(table_block) para_text += merge_para_with_text(block)
elif table_block.get('type') == BlockType.TableFootnote: elif block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(table_block) para_text += merge_para_with_text(block)
if para_text.strip() == '': if para_text.strip() == '':
continue continue
...@@ -141,11 +139,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): ...@@ -141,11 +139,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
return page_markdown return page_markdown
def merge_para_with_text(para): def merge_para_with_text(para_block):
para_text = '' para_text = ''
for line in para['lines']: for line in para_block['lines']:
for span in line['spans']: for span in line['spans']:
span_type = span.get('type') span_type = span['type']
content = '' content = ''
language = '' language = ''
if span_type == ContentType.Text: if span_type == ContentType.Text:
...@@ -159,6 +157,7 @@ def merge_para_with_text(para): ...@@ -159,6 +157,7 @@ def merge_para_with_text(para):
content = f"${span['content']}$" content = f"${span['content']}$"
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n$$\n{span['content']}\n$$\n"
if content != '': if content != '':
if language == 'en': # 英文语境下 content间需要空格分隔 if language == 'en': # 英文语境下 content间需要空格分隔
para_text += content + ' ' para_text += content + ' '
......
...@@ -61,7 +61,7 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -61,7 +61,7 @@ def parse_pdf_by_ocr(pdf_bytes,
'''将所有区块的bbox整理到一起''' '''将所有区块的bbox整理到一起'''
all_bboxes = ocr_prepare_bboxes_for_layout_split( all_bboxes = ocr_prepare_bboxes_for_layout_split(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equation_blocks, page_w, page_h) interline_equations, page_w, page_h)
'''根据区块信息计算layout''' '''根据区块信息计算layout'''
page_boundry = [0, 0, page_w, page_h] page_boundry = [0, 0, page_w, page_h]
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment