Unverified Commit 0fb9619a authored by Kaiwen Liu's avatar Kaiwen Liu Committed by GitHub

Merge branch 'magicpdf:master' into master

parents 8c3a37ff eebd9767
...@@ -24,7 +24,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import ( ...@@ -24,7 +24,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import (
from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \ adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
remove_spans_by_bboxes_dict remove_spans_by_bboxes_dict
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
def parse_pdf_by_ocr( def parse_pdf_by_ocr(
...@@ -158,8 +158,7 @@ def parse_pdf_by_ocr( ...@@ -158,8 +158,7 @@ def parse_pdf_by_ocr(
spans = modify_inline_equation(spans, displayed_list, text_inline_lines) spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
'''bbox去除粘连''' '''bbox去除粘连'''
spans = remove_overlap_between_bbox(spans) spans = remove_overlap_between_bbox_for_span(spans)
''' '''
对tpye=["interline_equation", "image", "table"]进行额外处理, 对tpye=["interline_equation", "image", "table"]进行额外处理,
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0 如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
......
...@@ -20,7 +20,6 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo ...@@ -20,7 +20,6 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \ from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \
fix_discarded_block fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2 from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
...@@ -98,7 +97,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, ...@@ -98,7 +97,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
img_blocks = magic_model.get_imgs(page_id) img_blocks = magic_model.get_imgs(page_id)
table_blocks = magic_model.get_tables(page_id) table_blocks = magic_model.get_tables(page_id)
discarded_blocks = magic_model.get_discarded(page_id) discarded_blocks = magic_model.get_discarded(page_id)
text_blocks = remove_overlap_between_bbox(magic_model.get_text_blocks(page_id)) text_blocks = magic_model.get_text_blocks(page_id)
title_blocks = magic_model.get_title_blocks(page_id) title_blocks = magic_model.get_title_blocks(page_id)
inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id) inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)
......
...@@ -2,6 +2,7 @@ from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_ove ...@@ -2,6 +2,7 @@ from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_ove
calculate_iou calculate_iou
from magic_pdf.libs.drop_tag import DropTag from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType from magic_pdf.libs.ocr_content_type import BlockType
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks, def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
...@@ -35,6 +36,8 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc ...@@ -35,6 +36,8 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks) all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
'''经过以上处理后,还存在大框套小框的情况,则删除小框''' '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
all_bboxes = remove_overlaps_min_blocks(all_bboxes) all_bboxes = remove_overlaps_min_blocks(all_bboxes)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes = remove_overlap_between_bbox_for_block(all_bboxes)
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)''' '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
for discarded in discarded_blocks: for discarded in discarded_blocks:
......
...@@ -5,7 +5,7 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox ...@@ -5,7 +5,7 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
from magic_pdf.libs.drop_tag import DropTag from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType, BlockType from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
# 将每一个line中的span从左到右排序 # 将每一个line中的span从左到右排序
...@@ -168,7 +168,7 @@ def fill_spans_in_blocks(blocks, spans, radio): ...@@ -168,7 +168,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines) block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错 '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox(block_spans) # block_spans = remove_overlap_between_bbox_for_span(block_spans)
block_dict['spans'] = block_spans block_dict['spans'] = block_spans
block_with_spans.append(block_dict) block_with_spans.append(block_dict)
......
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in
def _remove_overlap_between_bbox(spans): def _remove_overlap_between_bbox_for_span(spans):
res = [] res = []
keeps = [True] * len(spans) keeps = [True] * len(spans)
...@@ -34,7 +34,7 @@ def _remove_overlap_between_bbox(spans): ...@@ -34,7 +34,7 @@ def _remove_overlap_between_bbox(spans):
else: else:
mid = (ix0 + x1) // 2 mid = (ix0 + x1) // 2
ix0 = max(mid + 0.25, ix0) ix0 = max(mid + 0.25, ix0)
x1 = min(mid -0.25, x1) x1 = min(mid - 0.25, x1)
else: else:
if y1 >= iy1: if y1 >= iy1:
mid = (y0 + iy1) // 2 mid = (y0 + iy1) // 2
...@@ -51,5 +51,59 @@ def _remove_overlap_between_bbox(spans): ...@@ -51,5 +51,59 @@ def _remove_overlap_between_bbox(spans):
return res return res
def _remove_overlap_between_bbox_for_block(all_bboxes):
    """Drop nested boxes and pull apart the remaining overlapping ones.

    Each entry of ``all_bboxes`` is a mutable sequence whose first four
    items are ``x0, y0, x1, y1``; any further items are carried along
    untouched.

    The work is done in two passes:

    1. Every box for which ``_is_in(box, other)`` holds against some
       *other* box in the input is discarded (presumably "box lies inside
       other" -- confirm against ``magic_pdf.libs.boxbase``).
    2. The remaining boxes are visited in input order.  Whenever a box
       overlaps one that was already accepted, both are shrunk so they
       end 0.25 on either side of the floored midpoint of the overlap.
       The cut is made along x when the vertical overlap is larger than
       the horizontal one, otherwise along y.  Edges only ever move
       inwards (``min``/``max`` clamps), so no box grows.

    Args:
        all_bboxes: list of box records as described above.

    Returns:
        A new list holding the surviving records.  These are the *same*
        objects as in ``all_bboxes``; their first four items are
        rewritten in place.
    """
    # Pass 1: containment is judged on the untouched input coordinates,
    # before any shrinking happens.
    survivors = [
        box
        for i, box in enumerate(all_bboxes)
        if not any(
            _is_in(box[:4], other[:4])
            for j, other in enumerate(all_bboxes)
            if i != j
        )
    ]

    # Pass 2: separate each survivor from every box accepted before it.
    res = []
    for box in survivors:
        for kept in res:
            # Coordinates may have changed since pass 1, so containment
            # is re-checked; a box now inside `kept` is left as it is.
            if _is_in(box[:4], kept[:4]):
                continue
            if not _is_in_or_part_overlap(kept[:4], box[:4]):
                continue

            ix0, iy0, ix1, iy1 = kept[:4]
            x0, y0, x1, y1 = box[:4]

            # Extent of the intersection along each axis.
            diff_x = min(x1, ix1) - max(x0, ix0)
            diff_y = min(y1, iy1) - max(y0, iy0)

            if diff_y > diff_x:
                # Boxes sit side by side: cut along the x axis.
                if x1 >= ix1:
                    mid = (x0 + ix1) // 2
                    ix1 = min(mid - 0.25, ix1)
                    x0 = max(mid + 0.25, x0)
                else:
                    mid = (ix0 + x1) // 2
                    ix0 = max(mid + 0.25, ix0)
                    x1 = min(mid - 0.25, x1)
            else:
                # Boxes are stacked: cut along the y axis.
                if y1 >= iy1:
                    mid = (y0 + iy1) // 2
                    y0 = max(mid + 0.25, y0)
                    iy1 = min(iy1, mid - 0.25)
                else:
                    mid = (iy0 + y1) // 2
                    y1 = min(y1, mid - 0.25)
                    iy0 = max(mid + 0.25, iy0)

            # Write the adjusted coordinates back into both records.
            kept[:4] = [ix0, iy0, ix1, iy1]
            box[:4] = [x0, y0, x1, y1]

        res.append(box)
    return res
def remove_overlap_between_bbox_for_span(spans):
    """Public entry point for de-overlapping span bounding boxes.

    Forwards ``spans`` unchanged to
    ``_remove_overlap_between_bbox_for_span`` and returns whatever that
    function returns.
    """
    return _remove_overlap_between_bbox_for_span(spans)
def remove_overlap_between_bbox_for_block(all_bboxes):
    """Public entry point for de-overlapping block bounding boxes.

    Forwards ``all_bboxes`` unchanged to
    ``_remove_overlap_between_bbox_for_block`` and returns whatever that
    function returns.
    """
    return _remove_overlap_between_bbox_for_block(all_bboxes)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment