Commit a817075b authored by 赵小蒙

update discarded block and spans build logic

parent d4f96a05
...@@ -17,4 +17,5 @@ class BlockType: ...@@ -17,4 +17,5 @@ class BlockType:
Title = "title" Title = "title"
InterlineEquation = "interline_equation" InterlineEquation = "interline_equation"
Footnote = "footnote" Footnote = "footnote"
Discarded = "discarded"
...@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table ...@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \ from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \
combine_chars_to_pymudict combine_chars_to_pymudict
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \
fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2 from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
...@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, ...@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter) spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
'''将所有区块的bbox整理到一起''' '''将所有区块的bbox整理到一起'''
all_bboxes = ocr_prepare_bboxes_for_layout_split( all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equations, page_w, page_h) interline_equations, page_w, page_h)
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
'''如果当前页面没有bbox则跳过''' '''如果当前页面没有bbox则跳过'''
if len(all_bboxes) == 0: if len(all_bboxes) == 0:
logger.warning(f"skip this page, not found bbox, page_id: {page_id}") logger.warning(f"skip this page, not found useful bbox, page_id: {page_id}")
return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [], return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
[], [], interline_equations, discarded_blocks, [], [], interline_equations, fix_discarded_blocks,
need_drop, drop_reason) need_drop, drop_reason)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """ """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
...@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, ...@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes) sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
'''将span填入排好序的blocks中''' '''将span填入排好序的blocks中'''
block_with_spans = fill_spans_in_blocks(sorted_blocks, spans) block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.6)
'''对block进行fix操作''' '''对block进行fix操作'''
fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks) fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
...@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, ...@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''构造pdf_info_dict''' '''构造pdf_info_dict'''
page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, discarded_blocks, images, tables, interline_equations, fix_discarded_blocks,
need_drop, drop_reason) need_drop, drop_reason)
return page_info return page_info
......
...@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType ...@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks, def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
title_blocks, interline_equation_blocks, page_w, page_h): title_blocks, interline_equation_blocks, page_w, page_h):
all_bboxes = [] all_bboxes = []
all_discarded_blocks = []
for image in img_blocks: for image in img_blocks:
x0, y0, x1, y1 = image['bbox'] x0, y0, x1, y1 = image['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None]) all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None])
...@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc ...@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)''' '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
for discarded in discarded_blocks: for discarded in discarded_blocks:
x0, y0, x1, y1 = discarded['bbox'] x0, y0, x1, y1 = discarded['bbox']
all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None])
# 将footnote加入到all_bboxes中,用来计算layout
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2): if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None]) all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
return all_bboxes return all_bboxes, all_discarded_blocks
def fix_text_overlap_title_blocks(all_bboxes): def fix_text_overlap_title_blocks(all_bboxes):
......
...@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes): ...@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
return sort_blocks return sort_blocks
def fill_spans_in_blocks(blocks, spans): def fill_spans_in_blocks(blocks, spans, radio):
''' '''
将allspans中的span按位置关系,放入blocks中 将allspans中的span按位置关系,放入blocks中
''' '''
...@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans): ...@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
block_spans = [] block_spans = []
for span in spans: for span in spans:
span_bbox = span['bbox'] span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.6: if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio:
block_spans.append(span) block_spans.append(span)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)''' '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
...@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans): ...@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
for span in block_spans: for span in block_spans:
spans.remove(span) spans.remove(span)
return block_with_spans return block_with_spans, spans
def fix_block_spans(block_with_spans, img_blocks, table_blocks): def fix_block_spans(block_with_spans, img_blocks, table_blocks):
...@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks): ...@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return fix_blocks return fix_blocks
def fix_discarded_block(discarded_block_with_spans):
    """Apply ``fix_text_block`` to every discarded block and collect the results.

    Args:
        discarded_block_with_spans: Iterable of discarded blocks. In the
            caller visible in this commit it is the first value returned by
            ``fill_spans_in_blocks`` for the discarded blocks.

    Returns:
        A new list holding one ``fix_text_block`` result per input block,
        in the same order as the input.
    """
    # Discarded blocks go through the same fix routine as text blocks.
    # NOTE(review): fix_text_block is not shown in this view; confirm it
    # accepts blocks whose type is BlockType.Discarded.
    return [fix_text_block(entry) for entry in discarded_block_with_spans]
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str): def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
block_spans = [] block_spans = []
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中 # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment