ocr拼接逻辑更新

caa1588a · 赵小蒙 · a0be4652 · caa1588a · caa1588a
Commit caa1588a authored Mar 07, 2024 by 赵小蒙
Hide whitespace changes
Inline Side-by-side

Showing with 55 additions and 25 deletions

ocr_dict_merge.py magic_pdf/libs/ocr_dict_merge.py +17 -3

pdf_parse_by_ocr.py magic_pdf/pdf_parse_by_ocr.py +38 -22

No files found.
--- a/magic_pdf/libs/ocr_dict_merge.py
+++ b/magic_pdf/libs/ocr_dict_merge.py
-from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
+from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio
-def merge_spans(spans):
+# 删除重叠spans中较小的那些
+def remove_overlaps_min_spans(spans):
+    for span1 in spans.copy():
+        for span2 in spans.copy():
+            if span1 != span2:
+                overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
+                if overlap_box is not None:
+                    bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
+                    if bbox_to_remove is not None:
+                        spans.remove(bbox_to_remove)
+    return spans
+def merge_spans_to_line(spans):
    # 按照y0坐标排序
    spans.sort(key=lambda span: span['bbox'][1])
@@ -9,7 +22,8 @@ def merge_spans(spans):
    current_line = [spans[0]]
    for span in spans[1:]:
        # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
-        if span['type'] == "displayed_equation" or any(s['type'] == "displayed_equation" for s in current_line):
+        # image和table类型，同上
+        if span['type'] in ["displayed_equation", "image", "table"] or any(s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
            # 则开始新行
            lines.append(current_line)
            current_line = [span]

--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
-from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
+from loguru import logger
-from magic_pdf.libs.ocr_dict_merge import merge_spans
+from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
-def construct_page_component(page_id, text_blocks_preproc):
+def construct_page_component(page_id, blocks):
    return_dict = {
-        'preproc_blocks': text_blocks_preproc,
+        'preproc_blocks': blocks,
-        'page_idx': page_id
+        'page_idx': page_id,
    }
    return return_dict
@@ -24,17 +25,32 @@ def parse_pdf_by_ocr(
        spans = []
        for layout_det in layout_dets:
            category_id = layout_det['category_id']
-            allow_category_id_list = [13, 14, 15]
+            allow_category_id_list = [1, 7, 13, 14, 15]
            if category_id in allow_category_id_list:
                x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
                bbox = [int(x0), int(y0), int(x1), int(y1)]
-                #  13: 'embedding',     # 嵌入公式
+                '''要删除的'''
-                #  14: 'isolated',      # 单行公式
+                #  3: 'header',      # 页眉
-                #  15: 'ocr_text',      # ocr识别文本
+                #  4: 'page number', # 页码
+                #  5: 'footnote',    # 脚注
+                #  6: 'footer',      # 页脚
+                '''当成span拼接的'''
+                #  1: 'image', # 图片
+                #  7: 'table',       # 表格
+                #  13: 'inline_equation',     # 行内公式
+                #  14: 'displayed_equation',      # 行间公式
+                #  15: 'text',      # ocr识别文本
+                '''layout信息'''
+                #  11: 'full column',   # 单栏
+                #  12: 'sub column',    # 多栏
                span = {
                    'bbox': bbox,
                }
-                if category_id == 13:
+                if category_id == 1:
+                    span['type'] = 'image'
+                elif category_id == 7:
+                    span['type'] = 'table'
+                elif category_id == 13:
                    span['content'] = layout_det['latex']
                    span['type'] = 'inline_equation'
                elif category_id == 14:
@@ -48,18 +64,18 @@ def parse_pdf_by_ocr(
            else:
                continue
-        # 合并重叠的spans
+        # 删除重叠spans中较小的那些
-        for span1 in spans.copy():
+        spans = remove_overlaps_min_spans(spans)
-            for span2 in spans.copy():
-                if span1 != span2:
+        # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
-                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
-                    if overlap_box is not None:
-                        bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
+        # 将spans合并成line(从上到下,从左到右)
-                        if bbox_to_remove is not None:
+        lines = merge_spans_to_line(spans)
-                            spans.remove(bbox_to_remove)
+        # logger.info(lines)
-        # 将spans合并成line
+        # 从ocr_page_info中获取layout信息
-        lines = merge_spans(spans)
        # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
        blocks = []