Unverified Commit 6c8361fa authored by myhloli's avatar myhloli Committed by GitHub

Merge pull request #69 from myhloli/master

change remove spans logic
parents 7a745664 fcf94b2d
...@@ -9,16 +9,19 @@ from magic_pdf.libs.ocr_content_type import ContentType, BlockType ...@@ -9,16 +9,19 @@ from magic_pdf.libs.ocr_content_type import ContentType, BlockType
def remove_overlaps_min_spans(spans):
    """Drop the smaller span of every heavily-overlapping pair of spans.

    Every ordered pair of distinct spans is passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.65. When that helper
    returns a box, the first span in ``spans`` whose ``'bbox'`` equals the
    returned box is scheduled for removal.
    NOTE(review): the helper presumably returns the bbox of the smaller of
    the two boxes when their overlap exceeds the ratio -- confirm against its
    definition.

    Removal is deferred until the scan is finished so that ``spans`` is never
    mutated while it is being iterated.

    Args:
        spans: list of span dicts, each with a ``'bbox'`` entry. The list is
            modified in place.

    Returns:
        A tuple ``(spans, dropped_spans)``: the same ``spans`` list with the
        dropped entries removed, and the list of dropped spans, each tagged
        with ``DropTag.SPAN_OVERLAP`` under the ``'tag'`` key.
    """
    dropped_spans = []
    # Remove the smaller ones among overlapping spans.
    for span1 in spans:
        for span2 in spans:
            if span1 == span2:
                continue
            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
            if overlap_box is None:
                continue
            bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
            # Each overlapping pair is seen twice, as (a, b) and (b, a), and a
            # span may lose to several neighbours; record it only once so the
            # removal pass below does not call spans.remove() on an element
            # that is already gone.
            if bbox_to_remove is not None and bbox_to_remove not in dropped_spans:
                dropped_spans.append(bbox_to_remove)

    # Deferred removal: safe now that iteration over `spans` has finished.
    if dropped_spans:
        for dropped_span in dropped_spans:
            spans.remove(dropped_span)
            dropped_span['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment