Merge pull request #2 from icecraft/feat/proc_bbox

Feat/proc bbox

Merge pull request #2 from icecraft/feat/proc_bbox
Feat/proc bbox
59d817b1 · myhloli · GitHub · 070139a5 · 2611e853 · 59d817b1
Unverified Commit 59d817b1 authored Mar 12, 2024 by myhloli Committed by GitHub Mar 12, 2024
Show whitespace changes
Inline Side-by-side

Showing with 109 additions and 42 deletions

pdf_parse_by_ocr.py magic_pdf/pdf_parse_by_ocr.py +66 -42

remove_bbox_overlap.py magic_pdf/pre_proc/remove_bbox_overlap.py +43 -0

No files found.
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -5,7 +5,14 @@ import time
 from loguru import logger

 from demo.draw_bbox import draw_layout_bbox, draw_text_bbox
-from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
+from magic_pdf.libs.commons import (
+    read_file,
+    join_path,
+    fitz,
+    get_img_s3_client,
+    get_delta_time,
+    get_docx_model_output,
+)
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
@@ -14,8 +21,12 @@ from magic_pdf.pre_proc.detect_header import parse_headers
 from magic_pdf.pre_proc.detect_page_number import parse_pageNos
 from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
-from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout
+from magic_pdf.pre_proc.ocr_dict_merge import (
+    remove_overlaps_min_spans,
+    merge_spans_to_line_by_layout,
+)
 from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox


 def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree):
@@ -81,18 +92,23 @@ def parse_pdf_by_ocr(

        if debug_mode:
            time_now = time.time()
-            logger.info(f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}")
+            logger.info(
+                f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
+            )
            start_time = time_now

        # 获取当前页的模型数据
-        ocr_page_info = get_docx_model_output(pdf_model_output, pdf_model_profile, page_id)
+        ocr_page_info = get_docx_model_output(
+            pdf_model_output, pdf_model_profile, page_id
+        )

        """从json中获取每页的页码、页眉、页脚的bbox"""
        page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
        header_bboxes = parse_headers(page_id, page, ocr_page_info)
        footer_bboxes = parse_footers(page_id, page, ocr_page_info)
-        footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path,
-                                                   debug_mode=debug_mode)
+        footnote_bboxes = parse_footnotes_by_model(
+            page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode
+        )

        # 构建需要remove的bbox列表
        need_remove_spans_bboxes = []
@@ -101,51 +117,57 @@ def parse_pdf_by_ocr(
        need_remove_spans_bboxes.extend(footer_bboxes)
        need_remove_spans_bboxes.extend(footnote_bboxes)

-        layout_dets = ocr_page_info['layout_dets']
+        layout_dets = ocr_page_info["layout_dets"]
        spans = []

        # 计算模型坐标和pymu坐标的缩放比例
-        horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
+        horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
+            ocr_page_info, page
+        )

        for layout_det in layout_dets:
-            category_id = layout_det['category_id']
+            category_id = layout_det["category_id"]
            allow_category_id_list = [1, 7, 13, 14, 15]
            if category_id in allow_category_id_list:
-                x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
-                bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio),
-                        int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)]
-                '''要删除的'''
+                x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
+                bbox = [
+                    int(x0 / horizontal_scale_ratio),
+                    int(y0 / vertical_scale_ratio),
+                    int(x1 / horizontal_scale_ratio),
+                    int(y1 / vertical_scale_ratio),
+                ]
+                """要删除的"""
                #  3: 'header',      # 页眉
                #  4: 'page number', # 页码
                #  5: 'footnote',    # 脚注
                #  6: 'footer',      # 页脚
-                '''当成span拼接的'''
+                """当成span拼接的"""
                #  1: 'image', # 图片
                #  7: 'table',       # 表格
                #  13: 'inline_equation',     # 行内公式
                #  14: 'displayed_equation',      # 行间公式
                #  15: 'text',      # ocr识别文本
-                '''layout信息'''
+                """layout信息"""
                #  11: 'full column',   # 单栏
                #  12: 'sub column',    # 多栏
                span = {
-                    'bbox': bbox,
+                    "bbox": bbox,
                }
                if category_id == 1:
-                    span['type'] = 'image'
+                    span["type"] = "image"

                elif category_id == 7:
-                    span['type'] = 'table'
+                    span["type"] = "table"

                elif category_id == 13:
-                    span['content'] = layout_det['latex']
-                    span['type'] = 'inline_equation'
+                    span["content"] = layout_det["latex"]
+                    span["type"] = "inline_equation"
                elif category_id == 14:
-                    span['content'] = layout_det['latex']
-                    span['type'] = 'displayed_equation'
+                    span["content"] = layout_det["latex"]
+                    span["type"] = "displayed_equation"
                elif category_id == 15:
-                    span['content'] = layout_det['text']
-                    span['type'] = 'text'
+                    span["content"] = layout_det["text"]
+                    span["type"] = "text"
                # print(span)
                spans.append(span)
            else:
@@ -160,12 +182,12 @@ def parse_pdf_by_ocr(
        # 对image和table截图
        spans = cut_image_and_table(spans, page, page_id, book_name, save_path)

-
        # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)

        # 模型识别错误的行间公式, type类型转换成行内公式

        # bbox去除粘连
+        spans = remove_overlap_between_bbox(spans)

        # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0

@@ -175,14 +197,15 @@ def parse_pdf_by_ocr(
        # 将spans合并成line(在layout内,从上到下,从左到右)
        lines = merge_spans_to_line_by_layout(spans, layout_bboxes)

-
        # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
        blocks = []
        for line in lines:
-            blocks.append({
-                "bbox": line['bbox'],
+            blocks.append(
+                {
+                    "bbox": line["bbox"],
                    "lines": [line],
-            })
+                }
+            )

        # 构造pdf_info_dict
        page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree)
@@ -190,7 +213,9 @@ def parse_pdf_by_ocr(

    # 在测试时,保存调试信息
    if debug_mode:
-        params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
+        params_file_save_path = join_path(
+            save_tmp_path, "md", book_name, "preproc_out.json"
+        )
        with open(params_file_save_path, "w", encoding="utf-8") as f:
            json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)

@@ -198,5 +223,4 @@ def parse_pdf_by_ocr(
        draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
        draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)

-
    return pdf_info_dict
--- a/magic_pdf/pre_proc/remove_bbox_overlap.py
+++ b/magic_pdf/pre_proc/remove_bbox_overlap.py
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in
+
+
+def _remove_overlap_between_bbox(spans):
+    """Shrink overlapping span bboxes so that neighbouring boxes no longer touch.
+
+    Every span is compared against all spans that precede it in ``spans``.
+    When the two boxes overlap, both are trimmed around the integer midpoint
+    of the overlapping interval, along whichever axis has the smaller overlap.
+
+    Args:
+        spans: iterable of dicts, each holding a four-element ``"bbox"``
+            entry unpacked as ``x0, y0, x1, y1``.
+
+    Returns:
+        A new list containing the same span dicts in their original order.
+        The ``"bbox"`` entries of the dicts themselves are rewritten in place.
+    """
+    kept = []
+    for span in spans:
+        for prev in kept:
+            # Pairs reported by _is_in are left untouched.
+            # NOTE(review): presumably this means the earlier box lies fully
+            # inside the current one -- confirm against magic_pdf.libs.boxbase.
+            if _is_in(prev["bbox"], span["bbox"]):
+                continue
+            if not _is_in_or_part_overlap(prev["bbox"], span["bbox"]):
+                continue
+
+            px0, py0, px1, py1 = prev["bbox"]
+            sx0, sy0, sx1, sy1 = span["bbox"]
+
+            # Extent of the intersection along each axis.
+            overlap_w = min(sx1, px1) - max(sx0, px0)
+            overlap_h = min(sy1, py1) - max(sy0, py0)
+
+            if overlap_h > overlap_w:
+                # Taller than wide: separate the boxes horizontally.
+                if sx1 >= px1:
+                    # Current span extends at least as far right as the
+                    # earlier one, so it keeps the right-hand side.
+                    cut = (sx0 + px1) // 2
+                    px1 = min(cut, px1)
+                    sx0 = max(cut + 1, sx0)
+                else:
+                    cut = (px0 + sx1) // 2
+                    px0 = max(cut + 1, px0)
+                    sx1 = min(cut, sx1)
+            elif sy1 >= py1:
+                # Otherwise separate vertically; the current span keeps the
+                # lower part when its y1 is not smaller than the earlier y1.
+                cut = (sy0 + py1) // 2
+                sy0 = max(cut + 1, sy0)
+                py1 = min(py1, cut)
+            else:
+                cut = (py0 + sy1) // 2
+                sy1 = min(sy1, cut)
+                py0 = max(cut + 1, py0)
+
+            # The "+ 1" offsets above keep the two boxes from sharing the
+            # cut coordinate.
+            prev["bbox"] = [px0, py0, px1, py1]
+            span["bbox"] = [sx0, sy0, sx1, sy1]
+
+        kept.append(span)
+    return kept
+
+
+def remove_overlap_between_bbox(spans):
+    """Public entry point for bbox de-overlapping.
+
+    Delegates directly to ``_remove_overlap_between_bbox`` and returns its
+    result unchanged.
+
+    Args:
+        spans: the span dicts to process; passed through as-is.
+
+    Returns:
+        Whatever ``_remove_overlap_between_bbox(spans)`` returns.
+    """
+    separated = _remove_overlap_between_bbox(spans)
+    return separated