Commit 442f3684 authored by 赵小蒙

fix complicated layout logic

parent 232964d0
class DropReason: class DropReason:
TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖,导致无法准确定位文字顺序 TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖,导致无法准确定位文字顺序
USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # 需保留的block水平覆盖
COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局,暂时不支持 COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局,暂时不支持
TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的 TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。 COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
......
...@@ -18,6 +18,22 @@ from magic_pdf.para.para_split_v2 import para_split ...@@ -18,6 +18,22 @@ from magic_pdf.para.para_split_v2 import para_split
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
    """Remove the smaller bbox of one horizontally overlapping pair.

    Performs a single detection pass: the first four items of every entry
    are wrapped as ``{"bbox": ...}`` and passed to
    ``check_useful_block_horizontal_overlap``. If an overlap is reported,
    every entry whose first four items match the reported bbox is removed
    from ``all_bboxes`` in place.

    Args:
        all_bboxes: Mutable list of sequences whose first four items are
            the bbox coordinates (presumably x0, y0, x1, y1 -- confirm
            against the caller). The list is modified in place.

    Returns:
        A ``(overlap_found, all_bboxes)`` tuple. ``overlap_found`` is the
        flag returned by the overlap check for this pass; ``all_bboxes`` is
        the same list object that was passed in.
    """
    useful_blocks = [{"bbox": bbox[:4]} for bbox in all_bboxes]
    is_useful_block_horz_overlap, smaller_bbox = check_useful_block_horizontal_overlap(useful_blocks)
    if is_useful_block_horz_overlap:
        # Log the same reason the callers record as drop_reason
        # (USEFUL_BLOCK_HOR_OVERLAP), not the unrelated text-block reason.
        logger.warning(
            f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")
        # Compare as lists: a tuple never equals a list in Python, and a
        # missed match would leave the overlap in place so a caller that
        # loops until no overlap remains would never terminate.
        target = list(smaller_bbox)
        # Slice assignment keeps the caller's list object, as the original
        # in-place remove() did.
        all_bboxes[:] = [bbox for bbox in all_bboxes if list(bbox[:4]) != target]

    return is_useful_block_horz_overlap, all_bboxes
def parse_pdf_by_ocr(pdf_bytes, def parse_pdf_by_ocr(pdf_bytes,
model_list, model_list,
imageWriter, imageWriter,
...@@ -25,6 +41,9 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -25,6 +41,9 @@ def parse_pdf_by_ocr(pdf_bytes,
end_page_id=None, end_page_id=None,
debug_mode=False, debug_mode=False,
): ):
need_drop = False
drop_reason = ""
pdf_bytes_md5 = compute_md5(pdf_bytes) pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes) pdf_docs = fitz.open("pdf", pdf_bytes)
...@@ -66,16 +85,14 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -66,16 +85,14 @@ def parse_pdf_by_ocr(pdf_bytes,
interline_equations, page_w, page_h) interline_equations, page_w, page_h)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """ """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks = []
for bbox in all_bboxes: while True: # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
useful_blocks.append({ is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes)
"bbox": bbox[:4] if is_useful_block_horz_overlap:
}) need_drop = True
is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks) drop_reason = DropReason.USEFUL_BLOCK_HOR_OVERLAP
if is_useful_block_horz_overlap: else:
logger.warning( break
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
continue
'''根据区块信息计算layout''' '''根据区块信息计算layout'''
page_boundry = [0, 0, page_w, page_h] page_boundry = [0, 0, page_w, page_h]
...@@ -84,19 +101,23 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -84,19 +101,23 @@ def parse_pdf_by_ocr(pdf_bytes,
if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0: if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
logger.warning( logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}") f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
continue need_drop = True
drop_reason = DropReason.CAN_NOT_DETECT_PAGE_LAYOUT
"""以下去掉复杂的布局和超过2列的布局""" """以下去掉复杂的布局和超过2列的布局"""
if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局 if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局
logger.warning( logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}") f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
continue need_drop = True
drop_reason = DropReason.COMPLICATED_LAYOUT
layout_column_width = get_columns_cnt_of_layout(layout_tree) layout_column_width = get_columns_cnt_of_layout(layout_tree)
if layout_column_width > 2: # 去掉超过2列的布局pdf if layout_column_width > 2: # 去掉超过2列的布局pdf
logger.warning( logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}") f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
continue need_drop = True
drop_reason = DropReason.TOO_MANY_LAYOUT_COLUMNS
'''根据layout顺序,对当前页面所有需要留下的block进行排序''' '''根据layout顺序,对当前页面所有需要留下的block进行排序'''
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes) sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
...@@ -119,7 +140,8 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -119,7 +140,8 @@ def parse_pdf_by_ocr(pdf_bytes,
'''构造pdf_info_dict''' '''构造pdf_info_dict'''
page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, discarded_blocks) images, tables, interline_equations, discarded_blocks,
need_drop, drop_reason)
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
"""分段""" """分段"""
......
...@@ -32,6 +32,22 @@ from magic_pdf.libs.math import float_equal ...@@ -32,6 +32,22 @@ from magic_pdf.libs.math import float_equal
from magic_pdf.para.para_split_v2 import para_split from magic_pdf.para.para_split_v2 import para_split
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
    """Run one overlap check and drop the bbox it reports as smaller.

    The first four items of each entry in ``all_bboxes`` are wrapped as
    ``{"bbox": ...}`` and passed to ``check_useful_block_horizontal_overlap``.
    When an overlap is reported, a warning is logged and every entry whose
    first four items equal the reported bbox is removed from ``all_bboxes``
    in place.

    Returns:
        ``(overlap_found, all_bboxes)`` -- the flag from the overlap check
        and the same (possibly shortened) list object that was passed in.
    """
    candidates = [{"bbox": entry[:4]} for entry in all_bboxes]
    is_useful_block_horz_overlap, smaller_bbox = check_useful_block_horizontal_overlap(candidates)

    # Nothing overlaps: hand the list back untouched.
    if not is_useful_block_horz_overlap:
        return is_useful_block_horz_overlap, all_bboxes

    logger.warning(
        f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")
    # Filter in place so the caller's list object is the one that shrinks.
    all_bboxes[:] = [entry for entry in all_bboxes if not (smaller_bbox == entry[:4])]
    return is_useful_block_horz_overlap, all_bboxes
def txt_spans_extract(pdf_page, inline_equations, interline_equations): def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
...@@ -91,6 +107,9 @@ def parse_pdf_by_txt( ...@@ -91,6 +107,9 @@ def parse_pdf_by_txt(
end_page_id=None, end_page_id=None,
debug_mode=False, debug_mode=False,
): ):
need_drop = False
drop_reason = ""
pdf_bytes_md5 = compute_md5(pdf_bytes) pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes) pdf_docs = fitz.open("pdf", pdf_bytes)
...@@ -141,16 +160,14 @@ def parse_pdf_by_txt( ...@@ -141,16 +160,14 @@ def parse_pdf_by_txt(
) )
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """ """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks = []
for bbox in all_bboxes: while True: # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
useful_blocks.append({ is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes)
"bbox": bbox[:4] if is_useful_block_horz_overlap:
}) need_drop = True
is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks) drop_reason = DropReason.USEFUL_BLOCK_HOR_OVERLAP
if is_useful_block_horz_overlap: else:
logger.warning( break
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
continue
'''根据区块信息计算layout''' '''根据区块信息计算layout'''
page_boundry = [0, 0, page_w, page_h] page_boundry = [0, 0, page_w, page_h]
...@@ -159,19 +176,22 @@ def parse_pdf_by_txt( ...@@ -159,19 +176,22 @@ def parse_pdf_by_txt(
if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0: if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
logger.warning( logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}") f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
continue need_drop = True
drop_reason = DropReason.CAN_NOT_DETECT_PAGE_LAYOUT
"""以下去掉复杂的布局和超过2列的布局""" """以下去掉复杂的布局和超过2列的布局"""
if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局 if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局
logger.warning( logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}") f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
continue need_drop = True
drop_reason = DropReason.COMPLICATED_LAYOUT
layout_column_width = get_columns_cnt_of_layout(layout_tree) layout_column_width = get_columns_cnt_of_layout(layout_tree)
if layout_column_width > 2: # 去掉超过2列的布局pdf if layout_column_width > 2: # 去掉超过2列的布局pdf
logger.warning( logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}") f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
continue need_drop = True
drop_reason = DropReason.TOO_MANY_LAYOUT_COLUMNS
"""根据layout顺序,对当前页面所有需要留下的block进行排序""" """根据layout顺序,对当前页面所有需要留下的block进行排序"""
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes) sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
...@@ -211,6 +231,8 @@ def parse_pdf_by_txt( ...@@ -211,6 +231,8 @@ def parse_pdf_by_txt(
tables, tables,
interline_equations, interline_equations,
discarded_blocks, discarded_blocks,
need_drop,
drop_reason
) )
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
......
...@@ -55,7 +55,7 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, ...@@ -55,7 +55,7 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, discarded_blocks): images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
return_dict = { return_dict = {
'preproc_blocks': blocks, 'preproc_blocks': blocks,
'layout_bboxes': layout_bboxes, 'layout_bboxes': layout_bboxes,
...@@ -66,5 +66,7 @@ def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page ...@@ -66,5 +66,7 @@ def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page
'tables': tables, 'tables': tables,
'interline_equations': interline_equations, 'interline_equations': interline_equations,
'discarded_blocks': discarded_blocks, 'discarded_blocks': discarded_blocks,
'need_drop': need_drop,
'drop_reason': drop_reason,
} }
return return_dict return return_dict
...@@ -180,7 +180,12 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool: ...@@ -180,7 +180,12 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
for i in range(len(useful_bboxes)): for i in range(len(useful_bboxes)):
for j in range(i + 1, len(useful_bboxes)): for j in range(i + 1, len(useful_bboxes)):
area_i = (useful_bboxes[i][2] - useful_bboxes[i][0]) * (useful_bboxes[i][3] - useful_bboxes[i][1])
area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]): if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
return True if area_i > area_j:
return True, useful_bboxes[j]
else:
return True, useful_bboxes[i]
return False return False, None
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment