Commit a48f1d14 authored by 赵小蒙

skip complicated layout page

parent f01cb89f
...@@ -2,8 +2,9 @@ import time ...@@ -2,8 +2,9 @@ import time
from loguru import logger from loguru import logger
from magic_pdf.layout.layout_sort import get_bboxes_layout from magic_pdf.layout.layout_sort import get_bboxes_layout, LAYOUT_UNPROC, get_columns_cnt_of_layout
from magic_pdf.libs.convert_utils import dict_to_list from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5 from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.commons import fitz, get_delta_time from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.model.magic_model import MagicModel from magic_pdf.model.magic_model import MagicModel
...@@ -14,6 +15,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_ ...@@ -14,6 +15,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2 from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
# from magic_pdf.para.para_split import para_split # from magic_pdf.para.para_split import para_split
from magic_pdf.para.para_split_v2 import para_split from magic_pdf.para.para_split_v2 import para_split
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
def parse_pdf_by_ocr(pdf_bytes, def parse_pdf_by_ocr(pdf_bytes,
...@@ -63,10 +65,39 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -63,10 +65,39 @@ def parse_pdf_by_ocr(pdf_bytes,
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equations, page_w, page_h) interline_equations, page_w, page_h)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks = []
for bbox in all_bboxes:
useful_blocks.append({
"bbox": bbox[:4]
})
is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks)
if is_useful_block_horz_overlap:
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
continue
'''根据区块信息计算layout''' '''根据区块信息计算layout'''
page_boundry = [0, 0, page_w, page_h] page_boundry = [0, 0, page_w, page_h]
layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id) layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)
if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
continue
"""以下去掉复杂的布局和超过2列的布局"""
if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
continue
layout_column_width = get_columns_cnt_of_layout(layout_tree)
if layout_column_width > 2: # 去掉超过2列的布局pdf
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
continue
'''根据layout顺序,对当前页面所有需要留下的block进行排序''' '''根据layout顺序,对当前页面所有需要留下的block进行排序'''
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes) sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
......
...@@ -2,8 +2,9 @@ import time ...@@ -2,8 +2,9 @@ import time
from loguru import logger from loguru import logger
from magic_pdf.layout.layout_sort import get_bboxes_layout from magic_pdf.layout.layout_sort import get_bboxes_layout, LAYOUT_UNPROC, get_columns_cnt_of_layout
from magic_pdf.libs.convert_utils import dict_to_list from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5 from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.commons import fitz, get_delta_time from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.model.magic_model import MagicModel from magic_pdf.model.magic_model import MagicModel
...@@ -33,6 +34,8 @@ from magic_pdf.pre_proc.equations_replace import ( ...@@ -33,6 +34,8 @@ from magic_pdf.pre_proc.equations_replace import (
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.libs.math import float_equal from magic_pdf.libs.math import float_equal
from magic_pdf.para.para_split_v2 import para_split from magic_pdf.para.para_split_v2 import para_split
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
def txt_spans_extract(pdf_page, inline_equations, interline_equations): def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
...@@ -123,11 +126,38 @@ def parse_pdf_by_txt( ...@@ -123,11 +126,38 @@ def parse_pdf_by_txt(
page_h, page_h,
) )
"""根据区块信息计算layout""" """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks = []
for bbox in all_bboxes:
useful_blocks.append({
"bbox": bbox[:4]
})
is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks)
if is_useful_block_horz_overlap:
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
continue
'''根据区块信息计算layout'''
page_boundry = [0, 0, page_w, page_h] page_boundry = [0, 0, page_w, page_h]
layout_bboxes, layout_tree = get_bboxes_layout( layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)
all_bboxes, page_boundry, page_id
) if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
continue
"""以下去掉复杂的布局和超过2列的布局"""
if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
continue
layout_column_width = get_columns_cnt_of_layout(layout_tree)
if layout_column_width > 2: # 去掉超过2列的布局pdf
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
continue
"""根据layout顺序,对当前页面所有需要留下的block进行排序""" """根据layout顺序,对当前页面所有需要留下的block进行排序"""
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes) sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
......
""" """
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍 从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片 1. 首先去掉出现在图片上的bbox,图片包括表格和图片
...@@ -9,7 +8,8 @@ from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_over ...@@ -9,7 +8,8 @@ from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_over
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list): def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
text_raw_blocks: list):
""" """
text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json
当下采用一种粗暴的方式: 当下采用一种粗暴的方式:
...@@ -87,7 +87,7 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations: ...@@ -87,7 +87,7 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
# 图片和图片重叠,两张都暂时不参与版面计算 # 图片和图片重叠,两张都暂时不参与版面计算
images_dup_index = [] images_dup_index = []
for i in range(len(images)): for i in range(len(images)):
for j in range(i+1, len(images)): for j in range(i + 1, len(images)):
if _is_in_or_part_overlap(images[i], images[j]): if _is_in_or_part_overlap(images[i], images[j]):
images_dup_index.append(i) images_dup_index.append(i)
images_dup_index.append(j) images_dup_index.append(j)
...@@ -121,25 +121,25 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations: ...@@ -121,25 +121,25 @@ def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:
return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2 return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks:list, header, footer) -> bool: def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
""" """
检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。 检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。
因为这种情况大概率发生了公式没有被检测出来。 因为这种情况大概率发生了公式没有被检测出来。
""" """
if len(text_blocks)==0: if len(text_blocks) == 0:
return False return False
page_min_y = 0 page_min_y = 0
page_max_y = max(yy['bbox'][3] for yy in text_blocks) page_max_y = max(yy['bbox'][3] for yy in text_blocks)
def __max_y(lst:list): def __max_y(lst: list):
if len(lst)>0: if len(lst) > 0:
return max([item[1] for item in lst]) return max([item[1] for item in lst])
return page_min_y return page_min_y
def __min_y(lst:list): def __min_y(lst: list):
if len(lst)>0: if len(lst) > 0:
return min([item[3] for item in lst]) return min([item[3] for item in lst])
return page_max_y return page_max_y
...@@ -149,13 +149,38 @@ def check_text_block_horizontal_overlap(text_blocks:list, header, footer) -> boo ...@@ -149,13 +149,38 @@ def check_text_block_horizontal_overlap(text_blocks:list, header, footer) -> boo
txt_bboxes = [] txt_bboxes = []
for text_block in text_blocks: for text_block in text_blocks:
bbox = text_block["bbox"] bbox = text_block["bbox"]
if bbox[1]>=clip_y0 and bbox[3]<=clip_y1: if bbox[1] >= clip_y0 and bbox[3] <= clip_y1:
txt_bboxes.append(bbox) txt_bboxes.append(bbox)
for i in range(len(txt_bboxes)): for i in range(len(txt_bboxes)):
for j in range(i+1, len(txt_bboxes)): for j in range(i + 1, len(txt_bboxes)):
if _is_left_overlap(txt_bboxes[i], txt_bboxes[j]) or _is_left_overlap(txt_bboxes[j], txt_bboxes[i]): if _is_left_overlap(txt_bboxes[i], txt_bboxes[j]) or _is_left_overlap(txt_bboxes[j], txt_bboxes[i]):
return True return True
return False return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
    """
    Report whether any pair of useful blocks overlaps horizontally.

    When such an overlap exists the PDF is not processed any further, because
    it most likely means a formula was not detected.

    Args:
        useful_blocks: list of dicts, each carrying a "bbox" entry. Only
            indices 1 and 3 of the bbox are read here (presumably y0 and y1
            of an [x0, y0, x1, y1] box -- confirm against the callers).

    Returns:
        True if ``_is_left_overlap`` holds for some pair of retained bboxes,
        in either argument order; False otherwise, including for empty input.
    """
    if len(useful_blocks) == 0:
        return False

    # Vertical window used to retain bboxes: from 0 down to the largest
    # bbox[3] found among the supplied blocks.
    lower_y = 0
    upper_y = max(block['bbox'][3] for block in useful_blocks)

    candidates = [
        box
        for box in (block["bbox"] for block in useful_blocks)
        if box[1] >= lower_y and box[3] <= upper_y
    ]

    # Visit every unordered pair once and test the overlap both ways round.
    return any(
        _is_left_overlap(first, second) or _is_left_overlap(second, first)
        for pos, first in enumerate(candidates)
        for second in candidates[pos + 1:]
    )
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment