Commit a817075b authored by 赵小蒙

update discarded block and spans build logic

parent d4f96a05
......@@ -17,4 +17,5 @@ class BlockType:
Title = "title"
InterlineEquation = "interline_equation"
Footnote = "footnote"
Discarded = "discarded"
......@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \
combine_chars_to_pymudict
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \
fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
......@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
'''将所有区块的bbox整理到一起'''
all_bboxes = ocr_prepare_bboxes_for_layout_split(
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equations, page_w, page_h)
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
'''如果当前页面没有bbox则跳过'''
if len(all_bboxes) == 0:
logger.warning(f"skip this page, not found bbox, page_id: {page_id}")
logger.warning(f"skip this page, not found useful bbox, page_id: {page_id}")
return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
[], [], interline_equations, discarded_blocks,
[], [], interline_equations, fix_discarded_blocks,
need_drop, drop_reason)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
......@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
'''将span填入排好序的blocks中'''
block_with_spans = fill_spans_in_blocks(sorted_blocks, spans)
block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.6)
'''对block进行fix操作'''
fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
......@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''构造pdf_info_dict'''
page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, discarded_blocks,
images, tables, interline_equations, fix_discarded_blocks,
need_drop, drop_reason)
return page_info
......
......@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
title_blocks, interline_equation_blocks, page_w, page_h):
all_bboxes = []
all_discarded_blocks = []
for image in img_blocks:
x0, y0, x1, y1 = image['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None])
......@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
for discarded in discarded_blocks:
x0, y0, x1, y1 = discarded['bbox']
all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None])
# 将footnote加入到all_bboxes中,用来计算layout
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
return all_bboxes
return all_bboxes, all_discarded_blocks
def fix_text_overlap_title_blocks(all_bboxes):
......
......@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
return sort_blocks
def fill_spans_in_blocks(blocks, spans):
def fill_spans_in_blocks(blocks, spans, radio):
'''
将allspans中的span按位置关系,放入blocks中
'''
......@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
block_spans = []
for span in spans:
span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.6:
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio:
block_spans.append(span)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
......@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
for span in block_spans:
spans.remove(span)
return block_with_spans
return block_with_spans, spans
def fix_block_spans(block_with_spans, img_blocks, table_blocks):
......@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return fix_blocks
def fix_discarded_block(discarded_block_with_spans):
    """Run ``fix_text_block`` over every discarded block.

    Args:
        discarded_block_with_spans: Iterable of discarded blocks that
            already carry their spans.

    Returns:
        A new list holding the result of ``fix_text_block`` for each
        input block, in the same order as the input.
    """
    return [fix_text_block(item) for item in discarded_block_with_spans]
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
block_spans = []
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment