Commit 3c145ba0 authored by 赵小蒙

fix: some text char removed by interline_equations overlap

parent 999b698f
...@@ -107,6 +107,7 @@ def _is_in_or_part_overlap(box1, box2) -> bool: ...@@ -107,6 +107,7 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
or y0_1 > y1_2 or y0_1 > y1_2
) # box1在box2的下边 ) # box1在box2的下边
def remove_text_block_overlap_interline_equation_bbox( def remove_text_block_overlap_interline_equation_bbox(
interline_eq_bboxes, pymu_block_list interline_eq_bboxes, pymu_block_list
): ):
...@@ -123,7 +124,7 @@ def remove_text_block_overlap_interline_equation_bbox( ...@@ -123,7 +124,7 @@ def remove_text_block_overlap_interline_equation_bbox(
for char in span["chars"]: for char in span["chars"]:
if any( if any(
[ [
_is_in_or_part_overlap(char["bbox"], eq_bbox["bbox"]) (calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], char["bbox"]) > 0.5)
for eq_bbox in interline_eq_bboxes for eq_bbox in interline_eq_bboxes
] ]
): ):
......
...@@ -160,12 +160,12 @@ def fill_spans_in_blocks(blocks, spans, radio): ...@@ -160,12 +160,12 @@ def fill_spans_in_blocks(blocks, spans, radio):
block_spans.append(span) block_spans.append(span)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)''' '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list = [] # displayed_list = []
text_inline_lines = [] # text_inline_lines = []
modify_y_axis(block_spans, displayed_list, text_inline_lines) # modify_y_axis(block_spans, displayed_list, text_inline_lines)
'''模型识别错误的行间公式, type类型转换成行内公式''' '''模型识别错误的行间公式, type类型转换成行内公式'''
block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines) # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错 '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox_for_span(block_spans) # block_spans = remove_overlap_between_bbox_for_span(block_spans)
...@@ -196,8 +196,10 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks): ...@@ -196,8 +196,10 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
block = fix_image_block(block, img_blocks) block = fix_image_block(block, img_blocks)
elif block_type == BlockType.Table: elif block_type == BlockType.Table:
block = fix_table_block(block, table_blocks) block = fix_table_block(block, table_blocks)
elif block_type in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]: elif block_type in [BlockType.Text, BlockType.Title]:
block = fix_text_block(block) block = fix_text_block(block)
elif block_type == BlockType.InterlineEquation:
block = fix_interline_block(block)
else: else:
continue continue
fix_blocks.append(block) fix_blocks.append(block)
...@@ -315,6 +317,18 @@ def fix_table_block(block, table_blocks): ...@@ -315,6 +317,18 @@ def fix_table_block(block, table_blocks):
def fix_text_block(block): def fix_text_block(block):
# 文本block中的公式span都应该转换成行内type
for span in block['spans']:
if span['type'] == ContentType.InterlineEquation:
span['type'] = ContentType.InlineEquation
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines
del block['spans']
return block
def fix_interline_block(block):
block_lines = merge_spans_to_line(block['spans']) block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines) sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines block['lines'] = sort_block_lines
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment