Commit f01cb89f authored by 赵小蒙

fix lost image or table bug

parent 89198bfe
...@@ -106,29 +106,30 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): ...@@ -106,29 +106,30 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
if mode == 'nlp': if mode == 'nlp':
continue continue
elif mode == 'mm': elif mode == 'mm':
for block in para_block['blocks']: for block in para_block['blocks']: # 1st.拼image_body
if block['type'] == BlockType.ImageBody: if block['type'] == BlockType.ImageBody:
for line in block['lines']: for line in block['lines']:
for span in line['spans']: for span in line['spans']:
if span['type'] == ContentType.Image: if span['type'] == ContentType.Image:
para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n" para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
for block in para_block['blocks']: for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption: if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block) para_text += merge_para_with_text(block)
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
if mode == 'nlp': if mode == 'nlp':
continue continue
elif mode == 'mm': elif mode == 'mm':
for block in para_block['blocks']: for block in para_block['blocks']: # 1st.拼table_caption
if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block)
for block in para_block['blocks']: # 2nd.拼table_body
if block['type'] == BlockType.TableBody: if block['type'] == BlockType.TableBody:
for line in block['lines']: for line in block['lines']:
for span in line['spans']: for span in line['spans']:
if span['type'] == ContentType.Table: if span['type'] == ContentType.Table:
para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n" para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
for block in para_block['blocks']: for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableCaption: if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block)
elif block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block) para_text += merge_para_with_text(block)
if para_text.strip() == '': if para_text.strip() == '':
......
...@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans): ...@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
block_spans = [] block_spans = []
for span in spans: for span in spans:
span_bbox = span['bbox'] span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.7: if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.6:
block_spans.append(span) block_spans.append(span)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)''' '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
...@@ -167,8 +167,8 @@ def fill_spans_in_blocks(blocks, spans): ...@@ -167,8 +167,8 @@ def fill_spans_in_blocks(blocks, spans):
'''模型识别错误的行间公式, type类型转换成行内公式''' '''模型识别错误的行间公式, type类型转换成行内公式'''
block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines) block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''bbox去除粘连''' '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
block_spans = remove_overlap_between_bbox(block_spans) # block_spans = remove_overlap_between_bbox(block_spans)
block_dict['spans'] = block_spans block_dict['spans'] = block_spans
block_with_spans.append(block_dict) block_with_spans.append(block_dict)
...@@ -208,7 +208,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str): ...@@ -208,7 +208,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
block_spans = [] block_spans = []
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中 # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
for span in spans: for span in spans:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.8: if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6:
block_spans.append(span) block_spans.append(span)
block_lines = merge_spans_to_line(block_spans) block_lines = merge_spans_to_line(block_spans)
# 对line中的span进行排序 # 对line中的span进行排序
...@@ -268,6 +268,7 @@ def fix_table_block(block, table_blocks): ...@@ -268,6 +268,7 @@ def fix_table_block(block, table_blocks):
# 遍历table_blocks,找到与当前block匹配的table_block # 遍历table_blocks,找到与当前block匹配的table_block
for table_block in table_blocks: for table_block in table_blocks:
if table_block['bbox'] == block['bbox']: if table_block['bbox'] == block['bbox']:
# 创建table_body_block # 创建table_body_block
for span in block['spans']: for span in block['spans']:
if span['type'] == ContentType.Table and span['bbox'] == table_block['table_body_bbox']: if span['type'] == ContentType.Table and span['bbox'] == table_block['table_body_bbox']:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment