Unverified Commit 0fb9619a authored by Kaiwen Liu's avatar Kaiwen Liu Committed by GitHub

Merge branch 'magicpdf:master' into master

parents 8c3a37ff eebd9767
......@@ -24,7 +24,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import (
from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
remove_spans_by_bboxes_dict
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
def parse_pdf_by_ocr(
......@@ -158,8 +158,7 @@ def parse_pdf_by_ocr(
spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
'''bbox去除粘连'''
spans = remove_overlap_between_bbox(spans)
spans = remove_overlap_between_bbox_for_span(spans)
'''
对tpye=["interline_equation", "image", "table"]进行额外处理,
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
......
......@@ -20,7 +20,6 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \
fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
......@@ -98,7 +97,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
img_blocks = magic_model.get_imgs(page_id)
table_blocks = magic_model.get_tables(page_id)
discarded_blocks = magic_model.get_discarded(page_id)
text_blocks = remove_overlap_between_bbox(magic_model.get_text_blocks(page_id))
text_blocks = magic_model.get_text_blocks(page_id)
title_blocks = magic_model.get_title_blocks(page_id)
inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)
......
......@@ -2,6 +2,7 @@ from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_ove
calculate_iou
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
......@@ -35,6 +36,8 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
'''经过以上处理后,还存在大框套小框的情况,则删除小框'''
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes = remove_overlap_between_bbox_for_block(all_bboxes)
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
for discarded in discarded_blocks:
......
......@@ -5,7 +5,7 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
# 将每一个line中的span从左到右排序
......@@ -168,7 +168,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox(block_spans)
# block_spans = remove_overlap_between_bbox_for_span(block_spans)
block_dict['spans'] = block_spans
block_with_spans.append(block_dict)
......
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in
def _remove_overlap_between_bbox(spans):
def _remove_overlap_between_bbox_for_span(spans):
res = []
keeps = [True] * len(spans)
......@@ -34,7 +34,7 @@ def _remove_overlap_between_bbox(spans):
else:
mid = (ix0 + x1) // 2
ix0 = max(mid + 0.25, ix0)
x1 = min(mid -0.25, x1)
x1 = min(mid - 0.25, x1)
else:
if y1 >= iy1:
mid = (y0 + iy1) // 2
......@@ -51,5 +51,59 @@ def _remove_overlap_between_bbox(spans):
return res
def remove_overlap_between_bbox(spans):
    """Public entry point for overlap removal.

    Forwards ``spans`` unchanged to ``_remove_overlap_between_bbox`` and
    returns whatever that helper returns.
    """
    return _remove_overlap_between_bbox(spans)
def _remove_overlap_between_bbox_for_block(all_bboxes):
    """Pull apart partially overlapping block bboxes so they no longer touch.

    Args:
        all_bboxes: sequence of mutable, list-like blocks whose first four
            items are ``[x0, y0, x1, y1]``. Any further items are left
            untouched.

    Returns:
        A new list with the surviving blocks in their original order. The
        entries are the caller's own block objects; their first four items
        are rewritten in place whenever an overlap had to be resolved.
    """
    # Pass 1: discard every block that `_is_in` reports as lying inside some
    # other block. `any` stops at the first containing block instead of
    # scanning the rest of the list.
    # NOTE(review): if `_is_in` treats two identical boxes as contained in
    # each other, exact duplicates would both be dropped here - confirm
    # against the helper's definition.
    keeps = [
        not any(
            _is_in(box[:4], other[:4])
            for j, other in enumerate(all_bboxes)
            if j != i
        )
        for i, box in enumerate(all_bboxes)
    ]

    # Pass 2: compare each kept block with the blocks already accepted and
    # shrink both sides of every overlapping pair.
    res = []
    for box, keep in zip(all_bboxes, keeps):
        if not keep:
            continue

        for placed in res:
            # `box` may already have been shrunk by an earlier iteration, so
            # its current coordinates are re-read on every comparison.
            if _is_in(box[:4], placed[:4]):
                continue
            if not _is_in_or_part_overlap(placed[:4], box[:4]):
                continue

            ix0, iy0, ix1, iy1 = placed[:4]
            x0, y0, x1, y1 = box[:4]

            # Extent of the intersection along each axis.
            diff_x = min(x1, ix1) - max(x0, ix0)
            diff_y = min(y1, iy1) - max(y0, iy0)

            # Cut along the axis with the smaller overlap. The cut sits at
            # the floored midpoint of the overlapping edges and leaves a
            # 0.25 margin on each side; min/max ensure an edge only ever
            # moves inward.
            if diff_y > diff_x:
                if x1 >= ix1:
                    # `box` reaches further right: trim placed's right edge
                    # and box's left edge.
                    mid = (x0 + ix1) // 2
                    ix1 = min(mid - 0.25, ix1)
                    x0 = max(mid + 0.25, x0)
                else:
                    mid = (ix0 + x1) // 2
                    ix0 = max(mid + 0.25, ix0)
                    x1 = min(mid - 0.25, x1)
            else:
                if y1 >= iy1:
                    # `box` reaches further down: trim placed's bottom edge
                    # and box's top edge.
                    mid = (y0 + iy1) // 2
                    y0 = max(mid + 0.25, y0)
                    iy1 = min(iy1, mid - 0.25)
                else:
                    mid = (iy0 + y1) // 2
                    y1 = min(y1, mid - 0.25)
                    iy0 = max(mid + 0.25, iy0)

            # Write the adjusted coordinates back into both blocks in place.
            placed[:4] = [ix0, iy0, ix1, iy1]
            box[:4] = [x0, y0, x1, y1]

        res.append(box)
    return res
def remove_overlap_between_bbox_for_span(spans):
    """Public entry point for span-level overlap removal.

    Forwards ``spans`` unchanged to ``_remove_overlap_between_bbox_for_span``
    and returns whatever that helper returns.
    """
    return _remove_overlap_between_bbox_for_span(spans)
def remove_overlap_between_bbox_for_block(all_bboxes):
    """Public entry point for block-level overlap removal.

    Forwards ``all_bboxes`` unchanged to
    ``_remove_overlap_between_bbox_for_block`` and returns its result.
    """
    return _remove_overlap_between_bbox_for_block(all_bboxes)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment