Unverified commit 11bd9432, authored by Xiaomeng Zhao and committed by GitHub

Merge pull request #831 from opendatalab/dev

fix(pdf_parse): improve span removal logic for all content types
parents 4e685524 73afb7d6
...@@ -385,17 +385,20 @@ def revert_group_blocks(blocks): ...@@ -385,17 +385,20 @@ def revert_group_blocks(blocks):
def remove_outside_spans(spans, all_bboxes): def remove_outside_spans(spans, all_bboxes):
image_bboxes = [] image_bboxes = []
table_bboxes = [] table_bboxes = []
other_block_bboxes = []
for block in all_bboxes: for block in all_bboxes:
block_type = block[7] block_type = block[7]
block_bbox = block[0:4] block_bbox = block[0:4]
if block_type == BlockType.ImageBody: if block_type == BlockType.ImageBody:
image_bboxes.append(block_bbox) image_bboxes.append(block_bbox)
elif block_type == BlockType.TableBody: elif block_type == BlockType.TableBody:
table_bboxes.append(block_bbox) table_bboxes.append(block_bbox)
else: else:
continue other_block_bboxes.append(block_bbox)
new_spans = [] new_spans = []
for span in spans: for span in spans:
if span['type'] == ContentType.Image: if span['type'] == ContentType.Image:
for block_bbox in image_bboxes: for block_bbox in image_bboxes:
...@@ -408,7 +411,10 @@ def remove_outside_spans(spans, all_bboxes): ...@@ -408,7 +411,10 @@ def remove_outside_spans(spans, all_bboxes):
new_spans.append(span) new_spans.append(span)
break break
else: else:
new_spans.append(span) for block_bbox in other_block_bboxes:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
new_spans.append(span)
break
return new_spans return new_spans
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment