Commit eeda90af authored by myhloli

fix(pdf_parse): improve span removal logic for all content types

- Update remove_outside_spans function to handle all content types
- Add processing for text and equation spans
- Improve overlap calculation for better accuracy
parent 6b9f816f
......@@ -385,9 +385,11 @@ def revert_group_blocks(blocks):
def remove_outside_spans(spans, all_bboxes):
image_bboxes = []
table_bboxes = []
all_block_bboxes = []
for block in all_bboxes:
block_type = block[7]
block_bbox = block[0:4]
all_block_bboxes.append(block_bbox)
if block_type == BlockType.ImageBody:
image_bboxes.append(block_bbox)
elif block_type == BlockType.TableBody:
......@@ -396,6 +398,7 @@ def remove_outside_spans(spans, all_bboxes):
continue
new_spans = []
for span in spans:
if span['type'] == ContentType.Image:
for block_bbox in image_bboxes:
......@@ -407,6 +410,11 @@ def remove_outside_spans(spans, all_bboxes):
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
new_spans.append(span)
break
elif span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
for block_bbox in all_block_bboxes:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
new_spans.append(span)
break
else:
new_spans.append(span)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment