Commit 45ce99bf authored by 赵小蒙

block type 字段名修复

增加remove_overlaps_min_blocks逻辑
parent 3d60c2e8
......@@ -16,3 +16,4 @@ class DropTag:
FOOTNOTE = "footnote"
NOT_IN_LAYOUT = "not_in_layout"
SPAN_OVERLAP = "span_overlap"
BLOCK_OVERLAP = "block_overlap"
......@@ -70,10 +70,6 @@ def parse_pdf_by_ocr(pdf_bytes,
'''根据layout顺序,对当前页面所有需要留下的block进行排序'''
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
'''block嵌套问题解决'''
#@todo 1. text block大框套小框,删除小框 2. 图片或文本框与舍弃框重叠,优先信任舍弃框 3. 文本框与标题框重叠,优先信任文本框
'''获取所有需要拼接的span资源'''
spans = magic_model.get_all_spans(page_id)
'''删除重叠spans中较小的那些'''
......
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType
......@@ -31,5 +33,28 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
'''block嵌套问题解决'''
# @todo 1. text block大框套小框,删除小框 2. 图片或文本框与舍弃框重叠,优先信任舍弃框 3. 文本框与标题框重叠,优先信任文本框
all_bboxes, dropped_blocks = remove_overlaps_min_blocks(all_bboxes)
return all_bboxes
def remove_overlaps_min_blocks(all_bboxes):
    """Remove the smaller block of every heavily overlapping pair of blocks.

    Each pair of non-equal rows is passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8. When that helper
    returns a box, the first row still present in ``all_bboxes`` whose first
    four items match it is removed and recorded as dropped.

    Args:
        all_bboxes: list of positional block rows whose first four items are
            the box coordinates (``[x0, y0, x1, y1, ...]``). The list is
            modified in place.

    Returns:
        A ``(all_bboxes, dropped_blocks)`` tuple. ``all_bboxes`` is the same
        list object that was passed in, minus the removed rows. Each entry of
        ``dropped_blocks`` is a dict with the keys ``'bbox'`` (the removed
        row's coordinates), ``'block'`` (the removed row itself, unchanged)
        and ``'tag'`` (``DropTag.BLOCK_OVERLAP``).
    """
    dropped_blocks = []
    # Iterate over snapshots so that removing rows from ``all_bboxes`` does
    # not disturb the loops; the inner snapshot is refreshed on every outer
    # iteration and therefore reflects the removals made so far.
    for block1 in all_bboxes.copy():
        for block2 in all_bboxes.copy():
            if block1 == block2:
                continue
            block1_box = tuple(block1[:4])
            block2_box = tuple(block2[:4])
            overlap_box = get_minbox_if_overlap_by_ratio(block1_box, block2_box, 0.8)
            if overlap_box is None:
                continue
            # Normalise both sides to tuples: a list never compares equal to
            # a tuple, so the match must not depend on which sequence type
            # the helper hands back.
            target_box = tuple(overlap_box)
            bbox_to_remove = next(
                (block for block in all_bboxes if tuple(block[:4]) == target_box),
                None)
            if bbox_to_remove is None:
                # The matching row was already removed by an earlier pair.
                continue
            all_bboxes.remove(bbox_to_remove)
            # Rows are positional lists and cannot take a string key, so the
            # drop reason is stored in a separate record instead of on the row.
            dropped_blocks.append({
                'bbox': list(bbox_to_remove[:4]),
                'block': bbox_to_remove,
                'tag': DropTag.BLOCK_OVERLAP,
            })
    return all_bboxes, dropped_blocks
......@@ -150,7 +150,7 @@ def fill_spans_in_blocks(blocks, spans):
block_type = block[7]
block_bbox = block[0:4]
block_dict = {
'block_type': block_type,
'type': block_type,
'bbox': block_bbox,
}
block_spans = []
......@@ -190,7 +190,7 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
'''
fix_blocks = []
for block in block_with_spans:
block_type = block['block_type']
block_type = block['type']
if block_type == BlockType.Image:
block = fix_image_block(block, img_blocks)
......@@ -215,7 +215,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block = {
'bbox': block_bbox,
'block_type': block_type,
'type': block_type,
'lines': sort_block_lines
}
return block, block_spans
......@@ -229,7 +229,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
}
body_block = {
'bbox': block_bbox,
'block_type': block_type,
'type': block_type,
'lines': [body_line]
}
return body_block
......
......@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks):
interline_equations = []
for block in blocks:
if block["block_type"] == BlockType.Image:
if block["type"] == BlockType.Image:
images.append(block)
elif block["block_type"] == BlockType.Table:
elif block["type"] == BlockType.Table:
tables.append(block)
elif block["block_type"] == BlockType.InterlineEquation:
elif block["type"] == BlockType.InterlineEquation:
interline_equations.append(block)
return images, tables, interline_equations
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment