在dict中加入qa需要的字段

85587b25 · 赵小蒙 · b560c18f · 85587b25 · 85587b25 · 85587b25
Commit 85587b25 authored Mar 13, 2024 by 赵小蒙
Showing with 119 additions and 32 deletions

pdf_parse_by_ocr.py magic_pdf/pdf_parse_by_ocr.py +40 -20

ocr_dict_merge.py magic_pdf/pre_proc/ocr_dict_merge.py +13 -0

ocr_span_list_modify.py magic_pdf/pre_proc/ocr_span_list_modify.py +66 -12

No files found.
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -22,20 +22,30 @@ from magic_pdf.pre_proc.detect_page_number import parse_pageNos
 from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
 from magic_pdf.pre_proc.ocr_dict_merge import (
-    merge_spans_to_line_by_layout,
+    merge_spans_to_line_by_layout, merge_lines_to_block,
 )
 from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
-    adjust_bbox_for_standalone_block,modify_y_axis,modify_inline_equation
+    adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
+    remove_spans_by_bboxes_dict
 from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox


-def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree):
+def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+                             images, tables, interline_equations, inline_equations,
+                             dropped_text_block, dropped_image_block, dropped_table_block):
+    """Bundle one page's parsing results into a single dict.
+
+    Every argument is stored unchanged under a fixed key:
+    ``blocks`` -> 'preproc_blocks', ``layout_bboxes`` -> 'layout_bboxes',
+    ``page_id`` -> 'page_idx', ``[page_w, page_h]`` -> 'page_size',
+    ``layout_tree`` -> '_layout_tree'. ``images``, ``tables``,
+    ``interline_equations``, ``inline_equations``, ``dropped_text_block``,
+    ``dropped_image_block`` and ``dropped_table_block`` are stored under
+    keys with the same name as the argument.
+
+    Returns:
+        The newly built dict.
+    """
    return_dict = {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': [page_w, page_h],
        '_layout_tree': layout_tree,
+        'images': images,
+        'tables': tables,
+        'interline_equations': interline_equations,
+        'inline_equations': inline_equations,
+        'dropped_text_block': dropped_text_block,
+        'dropped_image_block': dropped_image_block,
+        'dropped_table_block': dropped_table_block,
    }
    return return_dict

@@ -79,7 +89,6 @@ def parse_pdf_by_ocr(

    start_time = time.time()

-    remove_bboxes = []

    end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
    for page_id in range(start_page_id, end_page_id + 1):
@@ -111,11 +120,19 @@ def parse_pdf_by_ocr(
        )

        # 构建需要remove的bbox列表
-        need_remove_spans_bboxes = []
-        need_remove_spans_bboxes.extend(page_no_bboxes)
-        need_remove_spans_bboxes.extend(header_bboxes)
-        need_remove_spans_bboxes.extend(footer_bboxes)
-        need_remove_spans_bboxes.extend(footnote_bboxes)
+        # need_remove_spans_bboxes = []
+        # need_remove_spans_bboxes.extend(page_no_bboxes)
+        # need_remove_spans_bboxes.extend(header_bboxes)
+        # need_remove_spans_bboxes.extend(footer_bboxes)
+        # need_remove_spans_bboxes.extend(footnote_bboxes)
+
+        # 构建需要remove的bbox字典
+        need_remove_spans_bboxes_dict = {
+            "page_no": page_no_bboxes,
+            "header": header_bboxes,
+            "footer": footer_bboxes,
+            "footnote": footnote_bboxes,
+        }

        layout_dets = ocr_page_info["layout_dets"]
        spans = []
@@ -177,7 +194,9 @@ def parse_pdf_by_ocr(
        spans = remove_overlaps_min_spans(spans)

        # 删除remove_span_block_bboxes中的bbox
-        spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
+        # spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
+        # 按qa要求，增加drop相关数据
+        spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)

        # 对image和table截图
        spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
@@ -202,18 +221,19 @@ def parse_pdf_by_ocr(
        # 将spans合并成line(在layout内,从上到下,从左到右)
        lines = merge_spans_to_line_by_layout(spans, layout_bboxes)

-        # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
-        blocks = []
-        for line in lines:
-            blocks.append(
-                {
-                    "bbox": line["bbox"],
-                    "lines": [line],
-                }
-            )
+        # 将lines合并成block
+        blocks = merge_lines_to_block(lines)
+
+        # 根据block合并段落
+
+
+        # 获取QA需要外置的list
+        images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)

        # 构造pdf_info_dict
-        page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree)
+        page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+                                             images, tables, interline_equations, inline_equations,
+                                             dropped_text_block, dropped_image_block, dropped_table_block)
        pdf_info_dict[f"page_{page_id}"] = page_info

    # 在测试时,保存调试信息

--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -80,6 +80,19 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
    return lines


+def merge_lines_to_block(lines):
+    """Wrap every line in its own single-line block.
+
+    Args:
+        lines: iterable of line dicts; each must have a "bbox" entry.
+
+    Returns:
+        A list with one block dict per input line, in input order. Each
+        block has "bbox" (the very same object as the line's "bbox", not
+        a copy) and "lines" (a one-element list holding that line).
+    """
+    # No block stitching is done for now; this only builds the structure:
+    # each block contains exactly one line and the block's bbox is the line's bbox.
+    blocks = []
+    for line in lines:
+        blocks.append(
+            {
+                "bbox": line["bbox"],
+                "lines": [line],
+            }
+        )
+    return blocks
+
+




--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
+from loguru import logger
+
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
    __is_overlaps_y_exceeds_threshold

@@ -31,6 +33,32 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    return spans


+def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
+    """Remove spans that overlap a to-be-removed bbox and collect them by type.
+
+    Args:
+        spans: list of span dicts; 'bbox' and 'type' are read from each.
+            The list is modified in place.
+        need_remove_spans_bboxes_dict: maps a tag name to a list of bboxes.
+
+    Returns:
+        A 4-tuple ``(spans, dropped_text_block, dropped_image_block,
+        dropped_table_block)``: the same ``spans`` list with the matching
+        spans removed, followed by the removed spans grouped by their
+        'type'. Every removed span gets a 'tag' entry set to the dict key
+        whose bboxes matched it. Keys are processed in dict iteration
+        order and a removed span is no longer visible to later keys, so
+        the tag is the first matching key.
+    """
+    dropped_text_block = []
+    dropped_image_block = []
+    dropped_table_block = []
+    for key, value in need_remove_spans_bboxes_dict.items():
+        # logger.info(f"remove spans by bbox dict, key: {key}, value: {value}")
+        need_remove_spans = []
+        for span in spans:
+            for removed_bbox in value:
+                # One bbox with a ratio above 0.5 is enough to mark the span.
+                # NOTE(review): the ratio is presumably the share of the span's
+                # area covered by removed_bbox -- confirm against the helper.
+                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
+                    need_remove_spans.append(span)
+                    break
+
+        # Removal happens after the scan, so `spans` is never mutated while
+        # it is being iterated.
+        for span in need_remove_spans:
+            spans.remove(span)
+            # Record which key caused the drop.
+            span['tag'] = key
+            # Spans of any other type are removed from `spans` as well, but
+            # are not added to any of the dropped lists.
+            if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
+                dropped_text_block.append(span)
+            elif span['type'] == 'image':
+                dropped_image_block.append(span)
+            elif span['type'] == 'table':
+                dropped_table_block.append(span)
+
+    return spans, dropped_text_block, dropped_image_block, dropped_table_block
+
+
 def adjust_bbox_for_standalone_block(spans):
    # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
    for sb_span in spans:
@@ -46,7 +74,6 @@ def adjust_bbox_for_standalone_block(spans):
    return spans


-
 def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    # displayed_list = []

@@ -105,8 +132,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
        current_line = line[0]
        current_line.sort(key=lambda span: span['bbox'][0])

-
-    #调整每一个文字行内bbox统一
+    # 调整每一个文字行内bbox统一
    for line in text_inline_lines:
        current_line, (line_first_y0, line_first_y) = line
        for span in current_line:
@@ -115,8 +141,9 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):

    # return spans, displayed_list, text_inline_lines

+
 def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
-    #错误行间公式转行内公式
+    # 错误行间公式转行内公式
    j = 0
    for i in range(len(displayed_list)):
        # if i == 8:
@@ -127,26 +154,53 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
        while j < len(text_inline_lines):
            text_line = text_inline_lines[j]
            y0, y1 = text_line[1]
-            if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+            if (
+                    span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(
+                    span['bbox'], (0, y0, 0, y1)):

-                #调整公式类型
+                # 调整公式类型
                if span["type"] == "displayed_equation":
-                    #最后一行是行间公式
-                    if j+1 >= len(text_inline_lines):
+                    # 最后一行是行间公式
+                    if j + 1 >= len(text_inline_lines):
                        span["type"] = "inline_equation"
                        span["bbox"][1] = y0
                        span["bbox"][3] = y1
                    else:
-                        #行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
+                        # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
                        y0_next, y1_next = text_inline_lines[j + 1][1]
-                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3*(y1-y0) > span_y - span_y0:
+                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
+                                y1 - y0) > span_y - span_y0:
                            span["type"] = "inline_equation"
                            span["bbox"][1] = y0
                            span["bbox"][3] = y1
                break
-            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'],
+                                                                                                       (0, y0, 0, y1)):
                break
            else:
                j += 1

-    return spans
\ No newline at end of file
+    return spans
+
+
+def get_qa_need_list(blocks):
+    """Collect image, table and equation spans from all blocks.
+
+    Walks ``block["lines"]`` -> ``line["spans"]`` for every block and
+    sorts each span by its "type".
+
+    Args:
+        blocks: iterable of block dicts with a "lines" list; each line
+            has a "spans" list and each span has a "type".
+
+    Returns:
+        A 4-tuple ``(images, tables, interline_equations,
+        inline_equations)`` of lists, each in traversal order. Spans of
+        type "displayed_equation" go into ``interline_equations``; spans
+        of any other unlisted type are skipped.
+    """
+    # Gather images, tables, interline_equations and inline_equations into
+    # separate lists. The span dicts themselves are appended (not copied),
+    # so the lists share objects with `blocks`.
+    images = []
+    tables = []
+    interline_equations = []
+    inline_equations = []
+
+    for block in blocks:
+        for line in block["lines"]:
+            for span in line["spans"]:
+                if span["type"] == "image":
+                    images.append(span)
+                elif span["type"] == "table":
+                    tables.append(span)
+                elif span["type"] == "inline_equation":
+                    inline_equations.append(span)
+                elif span["type"] == "displayed_equation":
+                    interline_equations.append(span)
+                else:
+                    continue
+    return images, tables, interline_equations, inline_equations