Commit 5eab010b authored by 赵小蒙

ocr模式对所有drop的span记录tag并分类

parent f5bfaaf6
COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block" COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block"
class DropTag:
    """String tags recording why a span was dropped.

    The members are plain ``str`` class attributes rather than an ``Enum``
    so that each value can be used directly as a dictionary key or stored
    in a span's ``'tag'`` field without conversion.
    """

    # Tags keyed to the bbox lists of regions that must be removed.
    PAGE_NUMBER = "page_no"    # span removed because of a page-number bbox
    HEADER = "header"          # span removed because of a header bbox
    FOOTER = "footer"          # span removed because of a footer bbox
    FOOTNOTE = "footnote"      # span removed because of a footnote bbox

    # Tags assigned by later processing steps.
    # NOTE(review): presumably marks spans that were not placed in any
    # layout bbox -- confirm against merge_spans_to_line_by_layout.
    NOT_IN_LAYOUT = "not_in_layout"
    # Span removed as the smaller one of two overlapping spans.
    SPAN_OVERLAP = "span_overlap"
...@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import ( ...@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import (
get_docx_model_output, get_docx_model_output,
) )
from magic_pdf.libs.coordinate_transform import get_scale_ratio from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.safe_filename import sanitize_filename from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.para.para_split import para_split from magic_pdf.para.para_split import para_split
...@@ -34,7 +35,7 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox ...@@ -34,7 +35,7 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, def construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations, images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block, dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
need_remove_spans_bboxes_dict): need_remove_spans_bboxes_dict):
return_dict = { return_dict = {
'preproc_blocks': blocks, 'preproc_blocks': blocks,
...@@ -50,6 +51,7 @@ def construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w ...@@ -50,6 +51,7 @@ def construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w
'droped_text_block': dropped_text_block, 'droped_text_block': dropped_text_block,
'droped_image_block': dropped_image_block, 'droped_image_block': dropped_image_block,
'droped_table_block': dropped_table_block, 'droped_table_block': dropped_table_block,
'dropped_equation_block': dropped_equation_block,
'droped_bboxes': need_remove_spans_bboxes_dict, 'droped_bboxes': need_remove_spans_bboxes_dict,
} }
return return_dict return return_dict
...@@ -133,10 +135,10 @@ def parse_pdf_by_ocr( ...@@ -133,10 +135,10 @@ def parse_pdf_by_ocr(
# 构建需要remove的bbox字典 # 构建需要remove的bbox字典
need_remove_spans_bboxes_dict = { need_remove_spans_bboxes_dict = {
"page_no": page_no_bboxes, DropTag.PAGE_NUMBER: page_no_bboxes,
"header": header_bboxes, DropTag.HEADER: header_bboxes,
"footer": footer_bboxes, DropTag.FOOTER: footer_bboxes,
"footnote": footnote_bboxes, DropTag.FOOTNOTE: footnote_bboxes,
} }
layout_dets = ocr_page_info["layout_dets"] layout_dets = ocr_page_info["layout_dets"]
...@@ -202,12 +204,12 @@ def parse_pdf_by_ocr( ...@@ -202,12 +204,12 @@ def parse_pdf_by_ocr(
# 删除重叠spans中较小的那些 # 删除重叠spans中较小的那些
spans = remove_overlaps_min_spans(spans) spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
# 删除remove_span_block_bboxes中的bbox # 删除remove_span_block_bboxes中的bbox
# spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes) # spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
# 按qa要求,增加drop相关数据 # 按qa要求,增加drop相关数据
spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict) spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
# 对image和table截图 # 对image和table截图
spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client) spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
...@@ -230,7 +232,7 @@ def parse_pdf_by_ocr( ...@@ -230,7 +232,7 @@ def parse_pdf_by_ocr(
layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info) layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)
# 将spans合并成line(在layout内,从上到下,从左到右) # 将spans合并成line(在layout内,从上到下,从左到右)
lines = merge_spans_to_line_by_layout(spans, layout_bboxes) lines, dropped_spans_by_layout = merge_spans_to_line_by_layout(spans, layout_bboxes)
# 将lines合并成block # 将lines合并成block
blocks = merge_lines_to_block(lines) blocks = merge_lines_to_block(lines)
...@@ -241,10 +243,33 @@ def parse_pdf_by_ocr( ...@@ -241,10 +243,33 @@ def parse_pdf_by_ocr(
# 获取QA需要外置的list # 获取QA需要外置的list
images, tables, interline_equations, inline_equations = get_qa_need_list(blocks) images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)
# drop的span_list合并
dropped_spans = []
dropped_spans.extend(dropped_spans_by_span_overlap)
dropped_spans.extend(dropped_spans_by_removed_bboxes)
dropped_spans.extend(dropped_spans_by_layout)
dropped_text_block = []
dropped_image_block = []
dropped_table_block = []
dropped_equation_block = []
for span in dropped_spans:
# drop出的spans进行分类
if span['type'] == ContentType.Text:
dropped_text_block.append(span)
elif span['type'] == ContentType.Image:
dropped_image_block.append(span)
elif span['type'] == ContentType.Table:
dropped_table_block.append(span)
elif span['type'] in [ContentType.InlineEquation, ContentType.InterlineEquation]:
dropped_equation_block.append(span)
# 构造pdf_info_dict # 构造pdf_info_dict
page_info = construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, page_info = construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations, images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block, dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
need_remove_spans_bboxes_dict) need_remove_spans_bboxes_dict)
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
......
...@@ -2,6 +2,7 @@ from loguru import logger ...@@ -2,6 +2,7 @@ from loguru import logger
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
calculate_overlap_area_in_bbox1_area_ratio calculate_overlap_area_in_bbox1_area_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType
...@@ -59,6 +60,7 @@ def merge_spans_to_line(spans): ...@@ -59,6 +60,7 @@ def merge_spans_to_line(spans):
def merge_spans_to_line_by_layout(spans, layout_bboxes): def merge_spans_to_line_by_layout(spans, layout_bboxes):
lines = [] lines = []
new_spans = [] new_spans = []
dropped_spans = []
for item in layout_bboxes: for item in layout_bboxes:
layout_bbox = item['layout_bbox'] layout_bbox = item['layout_bbox']
# 遍历spans,将每个span放入对应的layout中 # 遍历spans,将每个span放入对应的layout中
...@@ -78,10 +80,14 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes): ...@@ -78,10 +80,14 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
layout_lines = merge_spans_to_line(layout_sapns) layout_lines = merge_spans_to_line(layout_sapns)
lines.extend(layout_lines) lines.extend(layout_lines)
#对line中的span进行排序 # 对line中的span进行排序
lines = line_sort_spans_by_left_to_right(lines) lines = line_sort_spans_by_left_to_right(lines)
return lines for span in spans:
span['tag'] = DropTag.NOT_IN_LAYOUT
dropped_spans.append(span)
return lines, dropped_spans
def merge_lines_to_block(lines): def merge_lines_to_block(lines):
......
...@@ -2,10 +2,12 @@ from loguru import logger ...@@ -2,10 +2,12 @@ from loguru import logger
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \ from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
__is_overlaps_y_exceeds_threshold __is_overlaps_y_exceeds_threshold
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType
def remove_overlaps_min_spans(spans): def remove_overlaps_min_spans(spans):
dropped_spans = []
# 删除重叠spans中较小的那些 # 删除重叠spans中较小的那些
for span1 in spans.copy(): for span1 in spans.copy():
for span2 in spans.copy(): for span2 in spans.copy():
...@@ -15,7 +17,9 @@ def remove_overlaps_min_spans(spans): ...@@ -15,7 +17,9 @@ def remove_overlaps_min_spans(spans):
bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None) bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
if bbox_to_remove is not None: if bbox_to_remove is not None:
spans.remove(bbox_to_remove) spans.remove(bbox_to_remove)
return spans bbox_to_remove['tag'] = DropTag.SPAN_OVERLAP
dropped_spans.append(bbox_to_remove)
return spans, dropped_spans
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes): def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
...@@ -35,9 +39,7 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes): ...@@ -35,9 +39,7 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict): def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
dropped_text_block = [] dropped_spans = []
dropped_image_block = []
dropped_table_block = []
for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items(): for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
# logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}") # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
need_remove_spans = [] need_remove_spans = []
...@@ -50,14 +52,9 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict): ...@@ -50,14 +52,9 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
for span in need_remove_spans: for span in need_remove_spans:
spans.remove(span) spans.remove(span)
span['tag'] = drop_tag span['tag'] = drop_tag
if span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]: dropped_spans.append(span)
dropped_text_block.append(span)
elif span['type'] == ContentType.Image:
dropped_image_block.append(span)
elif span['type'] == ContentType.Table:
dropped_table_block.append(span)
return spans, dropped_text_block, dropped_image_block, dropped_table_block return spans, dropped_spans
def adjust_bbox_for_standalone_block(spans): def adjust_bbox_for_standalone_block(spans):
...@@ -98,7 +95,8 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list): ...@@ -98,7 +95,8 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation" # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上 # image和table类型,同上
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any( if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line): s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
current_line):
# 传入 # 传入
if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]: if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
displayed_list.append(span) displayed_list.append(span)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment