Commit 85587b25 authored by 赵小蒙

在dict中加入qa需要的字段

parent b560c18f
...@@ -22,20 +22,30 @@ from magic_pdf.pre_proc.detect_page_number import parse_pageNos ...@@ -22,20 +22,30 @@ from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import ( from magic_pdf.pre_proc.ocr_dict_merge import (
merge_spans_to_line_by_layout, merge_spans_to_line_by_layout, merge_lines_to_block,
) )
from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
adjust_bbox_for_standalone_block,modify_y_axis,modify_inline_equation adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
remove_spans_by_bboxes_dict
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                             images, tables, interline_equations, inline_equations,
                             dropped_text_block, dropped_image_block, dropped_table_block):
    """Bundle the per-page parsing results into a single page-info dict.

    Every argument is stored unchanged under its own key, except that
    ``page_w`` and ``page_h`` are combined into the two-element list stored
    under ``'page_size'`` and ``page_id`` is stored under ``'page_idx'``.

    Returns:
        dict: the page-info dictionary.
    """
    page_size = [page_w, page_h]
    return {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': page_size,
        '_layout_tree': layout_tree,
        # Lists exposed at page level for QA.
        'images': images,
        'tables': tables,
        'interline_equations': interline_equations,
        'inline_equations': inline_equations,
        # Spans that were dropped from the page content.
        'dropped_text_block': dropped_text_block,
        'dropped_image_block': dropped_image_block,
        'dropped_table_block': dropped_table_block,
    }
...@@ -79,7 +89,6 @@ def parse_pdf_by_ocr( ...@@ -79,7 +89,6 @@ def parse_pdf_by_ocr(
start_time = time.time() start_time = time.time()
remove_bboxes = []
end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1 end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
for page_id in range(start_page_id, end_page_id + 1): for page_id in range(start_page_id, end_page_id + 1):
...@@ -111,11 +120,19 @@ def parse_pdf_by_ocr( ...@@ -111,11 +120,19 @@ def parse_pdf_by_ocr(
) )
# 构建需要remove的bbox列表 # 构建需要remove的bbox列表
need_remove_spans_bboxes = [] # need_remove_spans_bboxes = []
need_remove_spans_bboxes.extend(page_no_bboxes) # need_remove_spans_bboxes.extend(page_no_bboxes)
need_remove_spans_bboxes.extend(header_bboxes) # need_remove_spans_bboxes.extend(header_bboxes)
need_remove_spans_bboxes.extend(footer_bboxes) # need_remove_spans_bboxes.extend(footer_bboxes)
need_remove_spans_bboxes.extend(footnote_bboxes) # need_remove_spans_bboxes.extend(footnote_bboxes)
# 构建需要remove的bbox字典
need_remove_spans_bboxes_dict = {
"page_no": page_no_bboxes,
"header": header_bboxes,
"footer": footer_bboxes,
"footnote": footnote_bboxes,
}
layout_dets = ocr_page_info["layout_dets"] layout_dets = ocr_page_info["layout_dets"]
spans = [] spans = []
...@@ -177,7 +194,9 @@ def parse_pdf_by_ocr( ...@@ -177,7 +194,9 @@ def parse_pdf_by_ocr(
spans = remove_overlaps_min_spans(spans) spans = remove_overlaps_min_spans(spans)
# 删除remove_span_block_bboxes中的bbox # 删除remove_span_block_bboxes中的bbox
spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes) # spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
# 按qa要求,增加drop相关数据
spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
# 对image和table截图 # 对image和table截图
spans = cut_image_and_table(spans, page, page_id, book_name, save_path) spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
...@@ -202,18 +221,19 @@ def parse_pdf_by_ocr( ...@@ -202,18 +221,19 @@ def parse_pdf_by_ocr(
# 将spans合并成line(在layout内,从上到下,从左到右) # 将spans合并成line(在layout内,从上到下,从左到右)
lines = merge_spans_to_line_by_layout(spans, layout_bboxes) lines = merge_spans_to_line_by_layout(spans, layout_bboxes)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox # 将lines合并成block
blocks = [] blocks = merge_lines_to_block(lines)
for line in lines:
blocks.append( # 根据block合并段落
{
"bbox": line["bbox"],
"lines": [line], # 获取QA需要外置的list
} images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)
)
# 构造pdf_info_dict # 构造pdf_info_dict
page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree) page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block)
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
# 在测试时,保存调试信息 # 在测试时,保存调试信息
......
...@@ -80,6 +80,19 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes): ...@@ -80,6 +80,19 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
return lines return lines
def merge_lines_to_block(lines):
    """Wrap every line in its own single-line block.

    No real block merging is done yet; this only establishes the block
    structure. Each block contains exactly one line, and the block's bbox
    is that line's bbox (the same object, not a copy).

    Args:
        lines: iterable of line dicts, each with a ``"bbox"`` key.

    Returns:
        list: one ``{"bbox": ..., "lines": [line]}`` dict per input line,
        in input order.
    """
    return [{"bbox": line["bbox"], "lines": [line]} for line in lines]
......
from loguru import logger
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \ from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
__is_overlaps_y_exceeds_threshold __is_overlaps_y_exceeds_threshold
...@@ -31,6 +33,32 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes): ...@@ -31,6 +33,32 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
return spans return spans
def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
    """Drop spans covered by the given bboxes and report what was dropped.

    For each ``tag -> bboxes`` entry (in dict order), every span still in
    ``spans`` whose overlap ratio with any of those bboxes, as computed by
    ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox)``,
    exceeds 0.5 is removed from ``spans`` and gets ``span['tag'] = tag``.

    Removed spans are sorted into three lists by ``span['type']``:
    text / inline_equation / displayed_equation, image, and table. A removed
    span of any other type is still removed and tagged but is not recorded.

    ``spans`` is modified in place and the same list object is returned, so
    callers that rely on either the mutation or the return value keep working.

    Args:
        spans: list of span dicts with ``'bbox'`` and ``'type'`` keys.
        need_remove_spans_bboxes_dict: mapping of tag name to a list of bboxes.

    Returns:
        tuple: ``(spans, dropped_text_block, dropped_image_block,
        dropped_table_block)``.
    """
    dropped_text_block = []
    dropped_image_block = []
    dropped_table_block = []

    for tag, bboxes in need_remove_spans_bboxes_dict.items():
        # Single pass per tag: partition into kept/dropped instead of calling
        # list.remove() per span, which is quadratic and matches by dict
        # equality rather than by identity.
        kept_spans = []
        for span in spans:
            overlaps = any(
                calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox) > 0.5
                for bbox in bboxes
            )
            if not overlaps:
                kept_spans.append(span)
                continue

            span['tag'] = tag
            span_type = span['type']
            if span_type in ('text', 'inline_equation', 'displayed_equation'):
                dropped_text_block.append(span)
            elif span_type == 'image':
                dropped_image_block.append(span)
            elif span_type == 'table':
                dropped_table_block.append(span)

        # Slice assignment keeps the caller's list object and updates it in place.
        spans[:] = kept_spans

    return spans, dropped_text_block, dropped_image_block, dropped_table_block
def adjust_bbox_for_standalone_block(spans): def adjust_bbox_for_standalone_block(spans):
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0 # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
for sb_span in spans: for sb_span in spans:
...@@ -46,7 +74,6 @@ def adjust_bbox_for_standalone_block(spans): ...@@ -46,7 +74,6 @@ def adjust_bbox_for_standalone_block(spans):
return spans return spans
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list): def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
# displayed_list = [] # displayed_list = []
...@@ -105,8 +132,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list): ...@@ -105,8 +132,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
current_line = line[0] current_line = line[0]
current_line.sort(key=lambda span: span['bbox'][0]) current_line.sort(key=lambda span: span['bbox'][0])
# 调整每一个文字行内bbox统一
#调整每一个文字行内bbox统一
for line in text_inline_lines: for line in text_inline_lines:
current_line, (line_first_y0, line_first_y) = line current_line, (line_first_y0, line_first_y) = line
for span in current_line: for span in current_line:
...@@ -115,8 +141,9 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list): ...@@ -115,8 +141,9 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
# return spans, displayed_list, text_inline_lines # return spans, displayed_list, text_inline_lines
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list): def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
#错误行间公式转行内公式 # 错误行间公式转行内公式
j = 0 j = 0
for i in range(len(displayed_list)): for i in range(len(displayed_list)):
# if i == 8: # if i == 8:
...@@ -127,26 +154,53 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: ...@@ -127,26 +154,53 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
while j < len(text_inline_lines): while j < len(text_inline_lines):
text_line = text_inline_lines[j] text_line = text_inline_lines[j]
y0, y1 = text_line[1] y0, y1 = text_line[1]
if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)): if (
span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(
span['bbox'], (0, y0, 0, y1)):
#调整公式类型 # 调整公式类型
if span["type"] == "displayed_equation": if span["type"] == "displayed_equation":
#最后一行是行间公式 # 最后一行是行间公式
if j+1 >= len(text_inline_lines): if j + 1 >= len(text_inline_lines):
span["type"] = "inline_equation" span["type"] = "inline_equation"
span["bbox"][1] = y0 span["bbox"][1] = y0
span["bbox"][3] = y1 span["bbox"][3] = y1
else: else:
#行间公式旁边有多行文字或者行间公式比文字高3倍则不转换 # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
y0_next, y1_next = text_inline_lines[j + 1][1] y0_next, y1_next = text_inline_lines[j + 1][1]
if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3*(y1-y0) > span_y - span_y0: if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
y1 - y0) > span_y - span_y0:
span["type"] = "inline_equation" span["type"] = "inline_equation"
span["bbox"][1] = y0 span["bbox"][1] = y0
span["bbox"][3] = y1 span["bbox"][3] = y1
break break
elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)): elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'],
(0, y0, 0, y1)):
break break
else: else:
j += 1 j += 1
return spans return spans
def get_qa_need_list(blocks):
    """Collect the spans QA wants listed outside the block structure.

    Visits every span of every line of every block and groups it by
    ``span["type"]``. The returned lists hold the original span objects
    (no copies are made); spans of any other type are ignored.

    Args:
        blocks: iterable of block dicts, each with a ``"lines"`` list whose
            items carry a ``"spans"`` list.

    Returns:
        tuple: ``(images, tables, interline_equations, inline_equations)``,
        where ``interline_equations`` holds the ``"displayed_equation"`` spans.
    """
    images, tables = [], []
    interline_equations, inline_equations = [], []

    # Flatten blocks -> lines -> spans lazily, preserving document order.
    every_span = (
        span
        for block in blocks
        for line in block["lines"]
        for span in line["spans"]
    )
    for span in every_span:
        span_type = span["type"]
        if span_type == "image":
            images.append(span)
        elif span_type == "table":
            tables.append(span)
        elif span_type == "inline_equation":
            inline_equations.append(span)
        elif span_type == "displayed_equation":
            interline_equations.append(span)

    return images, tables, interline_equations, inline_equations
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment