Unverified Commit a77cb36d authored by myhloli's avatar myhloli Committed by GitHub

Merge pull request #41 from myhloli/master

block type 字段名修复
parents af84a8ac 45ce99bf
...@@ -16,3 +16,4 @@ class DropTag: ...@@ -16,3 +16,4 @@ class DropTag:
FOOTNOTE = "footnote" FOOTNOTE = "footnote"
NOT_IN_LAYOUT = "not_in_layout" NOT_IN_LAYOUT = "not_in_layout"
SPAN_OVERLAP = "span_overlap" SPAN_OVERLAP = "span_overlap"
BLOCK_OVERLAP = "block_overlap"
...@@ -70,10 +70,6 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -70,10 +70,6 @@ def parse_pdf_by_ocr(pdf_bytes,
'''根据layout顺序,对当前页面所有需要留下的block进行排序''' '''根据layout顺序,对当前页面所有需要留下的block进行排序'''
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes) sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
'''block嵌套问题解决'''
#@todo 1. text block大框套小框,删除小框 2. 图片或文本框与舍弃框重叠,优先信任舍弃框 3. 文本框与标题框重叠,优先信任文本框
'''获取所有需要拼接的span资源''' '''获取所有需要拼接的span资源'''
spans = magic_model.get_all_spans(page_id) spans = magic_model.get_all_spans(page_id)
'''删除重叠spans中较小的那些''' '''删除重叠spans中较小的那些'''
......
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType from magic_pdf.libs.ocr_content_type import BlockType
...@@ -31,5 +33,28 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc ...@@ -31,5 +33,28 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2): if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None]) all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
'''block嵌套问题解决'''
# @todo 1. text block大框套小框,删除小框 2. 图片或文本框与舍弃框重叠,优先信任舍弃框 3. 文本框与标题框重叠,优先信任文本框
all_bboxes, dropped_blocks = remove_overlaps_min_blocks(all_bboxes)
return all_bboxes return all_bboxes
def remove_overlaps_min_blocks(all_bboxes):
    """Remove the smaller block of every heavily overlapping pair of blocks.

    Every ordered pair of distinct blocks is passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8. When that helper
    reports a box, the first block still present in ``all_bboxes`` whose
    coordinates equal that box is removed.

    Args:
        all_bboxes: list of list-shaped blocks whose first four items are
            the box coordinates. The list is modified in place.

    Returns:
        A ``(all_bboxes, dropped_blocks)`` tuple: the same list object that
        was passed in (with the overlapped blocks removed) and the list of
        removed blocks, in removal order.

    Note:
        Blocks are positional lists, so they have no ``'tag'`` key to
        assign; the removed blocks are returned unmodified. Callers that
        need a drop tag should attach it to their own record of the block.
    """
    dropped_blocks = []
    # Walk snapshots so that removing from all_bboxes never disturbs the
    # iteration itself; the inner snapshot is retaken per outer block so it
    # reflects removals made so far.
    for block1 in all_bboxes.copy():
        for block2 in all_bboxes.copy():
            if block1 == block2:
                continue
            block1_box = tuple(block1[:4])
            block2_box = tuple(block2[:4])
            overlap_box = get_minbox_if_overlap_by_ratio(block1_box, block2_box, 0.8)
            if overlap_box is None:
                continue
            # Compare coordinates as tuples: a list never compares equal to
            # a tuple, so matching a freshly built list against the helper's
            # result could never succeed if the helper hands back a tuple.
            target_box = tuple(overlap_box)
            bbox_to_remove = next(
                (block for block in all_bboxes if tuple(block[:4]) == target_box),
                None)
            if bbox_to_remove is not None:
                all_bboxes.remove(bbox_to_remove)
                dropped_blocks.append(bbox_to_remove)
    return all_bboxes, dropped_blocks
...@@ -150,7 +150,7 @@ def fill_spans_in_blocks(blocks, spans): ...@@ -150,7 +150,7 @@ def fill_spans_in_blocks(blocks, spans):
block_type = block[7] block_type = block[7]
block_bbox = block[0:4] block_bbox = block[0:4]
block_dict = { block_dict = {
'block_type': block_type, 'type': block_type,
'bbox': block_bbox, 'bbox': block_bbox,
} }
block_spans = [] block_spans = []
...@@ -190,7 +190,7 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks): ...@@ -190,7 +190,7 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
''' '''
fix_blocks = [] fix_blocks = []
for block in block_with_spans: for block in block_with_spans:
block_type = block['block_type'] block_type = block['type']
if block_type == BlockType.Image: if block_type == BlockType.Image:
block = fix_image_block(block, img_blocks) block = fix_image_block(block, img_blocks)
...@@ -215,7 +215,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str): ...@@ -215,7 +215,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
sort_block_lines = line_sort_spans_by_left_to_right(block_lines) sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block = { block = {
'bbox': block_bbox, 'bbox': block_bbox,
'block_type': block_type, 'type': block_type,
'lines': sort_block_lines 'lines': sort_block_lines
} }
return block, block_spans return block, block_spans
...@@ -229,7 +229,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str): ...@@ -229,7 +229,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
} }
body_block = { body_block = {
'bbox': block_bbox, 'bbox': block_bbox,
'block_type': block_type, 'type': block_type,
'lines': [body_line] 'lines': [body_line]
} }
return body_block return body_block
......
...@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks): ...@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks):
interline_equations = [] interline_equations = []
for block in blocks: for block in blocks:
if block["block_type"] == BlockType.Image: if block["type"] == BlockType.Image:
images.append(block) images.append(block)
elif block["block_type"] == BlockType.Table: elif block["type"] == BlockType.Table:
tables.append(block) tables.append(block)
elif block["block_type"] == BlockType.InterlineEquation: elif block["type"] == BlockType.InterlineEquation:
interline_equations.append(block) interline_equations.append(block)
return images, tables, interline_equations return images, tables, interline_equations
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment