Commit 873179a5 authored by quyuan's avatar quyuan

Merge branch 'master' of https://github.com/magicpdf/Magic-PDF

parents d69928af 6c8361fa
......@@ -57,8 +57,8 @@ def fix_text_overlap_title_blocks(all_bboxes):
for text_block in text_blocks:
for title_block in title_blocks:
text_block_bbox = text_block[0], text_block[1], text_block[2], text_block[3]
title_block_bbox = title_block[0], title_block[1], title_block[2], title_block[3]
text_block_bbox = text_block[:4]
title_block_bbox = title_block[:4]
if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
all_bboxes.remove(title_block)
......@@ -66,27 +66,34 @@ def fix_text_overlap_title_blocks(all_bboxes):
def remove_need_drop_blocks(all_bboxes, discarded_blocks):
for block in all_bboxes.copy():
need_remove = []
for block in all_bboxes:
for discarded_block in discarded_blocks:
block_bbox = block[0], block[1], block[2], block[3]
block_bbox = block[:4]
if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
all_bboxes.remove(block)
need_remove.append(block)
for block in need_remove:
all_bboxes.remove(block)
return all_bboxes
def remove_overlaps_min_blocks(all_bboxes):
# 删除重叠blocks中较小的那些
for block1 in all_bboxes.copy():
for block2 in all_bboxes.copy():
need_remove = []
for block1 in all_bboxes:
for block2 in all_bboxes:
if block1 != block2:
block1_bbox = [block1[0], block1[1], block1[2], block1[3]]
block2_bbox = [block2[0], block2[1], block2[2], block2[3]]
block1_bbox = block1[:4]
block2_bbox = block2[:4]
overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
if overlap_box is not None:
bbox_to_remove = next(
(block for block in all_bboxes if [block[0], block[1], block[2], block[3]] == overlap_box),
None)
bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
if bbox_to_remove is not None:
all_bboxes.remove(bbox_to_remove)
need_remove.append(bbox_to_remove)
if len(need_remove) > 0:
for block in need_remove:
all_bboxes.remove(block)
return all_bboxes
......@@ -9,16 +9,19 @@ from magic_pdf.libs.ocr_content_type import ContentType, BlockType
def remove_overlaps_min_spans(spans):
dropped_spans = []
# 删除重叠spans中较小的那些
for span1 in spans.copy():
for span2 in spans.copy():
for span1 in spans:
for span2 in spans:
if span1 != span2:
overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
if overlap_box is not None:
bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
if bbox_to_remove is not None:
spans.remove(bbox_to_remove)
bbox_to_remove['tag'] = DropTag.SPAN_OVERLAP
dropped_spans.append(bbox_to_remove)
if len(dropped_spans > 0):
for dropped_span in dropped_spans:
spans.remove(dropped_span)
dropped_span['tag'] = DropTag.SPAN_OVERLAP
return spans, dropped_spans
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment