Commit ad0d06b6 authored by myhloli

fix(pdf_parse): improve span removal logic for all content types

- Update remove_outside_spans function to handle all content types
- Add processing for text and equation spans
- Improve overlap calculation for better accuracy
parent 509128d5
......@@ -410,13 +410,11 @@ def remove_outside_spans(spans, all_bboxes):
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
new_spans.append(span)
break
elif span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
else:
for block_bbox in other_block_bboxes:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
new_spans.append(span)
break
else:
new_spans.append(span)
return new_spans
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment