ocr模式下content type 抽象

26c23782 · 赵小蒙 · b6f051d8 · 26c23782 · 26c23782 · 26c23782
Commit 26c23782 authored Mar 14, 2024 by 赵小蒙
7 changed files
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
+from magic_pdf.libs.ocr_content_type import ContentType
 def mk_nlp_markdown(pdf_info_dict: dict):
    markdown = []
@@ -12,9 +15,9 @@ def mk_nlp_markdown(pdf_info_dict: dict):
                    if not span.get('content'):
                        continue
                    content = span['content'].replace('$', '\$')  # 转义$
-                    if span['type'] == 'inline_equation':
+                    if span['type'] == ContentType.InlineEquation:
                        content = f"${content}$"
-                    elif span['type'] == 'displayed_equation':
+                    elif span['type'] == ContentType.InterlineEquation:
                        content = f"$$\n{content}\n$$"
                    line_text += content + ' '
                # 在行末添加两个空格以强制换行
@@ -41,9 +44,9 @@ def mk_mm_markdown(pdf_info_dict: dict):
                            content = f"![]({span['image_path']})"
                    else:
                        content = span['content'].replace('$', '\$')  # 转义$
-                        if span['type'] == 'inline_equation':
+                        if span['type'] == ContentType.InlineEquation:
                            content = f"${content}$"
-                        elif span['type'] == 'displayed_equation':
+                        elif span['type'] == ContentType.InterlineEquation:
                            content = f"$$\n{content}\n$$"
                    line_text += content + ' '
                # 在行末添加两个空格以强制换行

--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
 from magic_pdf.libs.commons import fitz  # PyMuPDF
+from magic_pdf.libs.ocr_content_type import ContentType
 def draw_bbox_without_number(i, bbox_list, page, rgb_config):
    new_rgb = []
@@ -49,30 +51,30 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
 def draw_text_bbox(pdf_info_dict, input_path, out_path):
    text_list = []
    inline_equation_list = []
-    displayed_equation_list = []
+    interline_equation_list = []
    for page in pdf_info_dict.values():
        page_text_list = []
        page_inline_equation_list = []
-        page_displayed_equation_list = []
+        page_interline_equation_list = []
        for block in page['preproc_blocks']:
            for line in block['lines']:
                for span in line['spans']:
-                    if span['type'] == 'text':
+                    if span['type'] == ContentType.Text:
                        page_text_list.append(span['bbox'])
-                    elif span['type'] == 'inline_equation':
+                    elif span['type'] == ContentType.InlineEquation:
                        page_inline_equation_list.append(span['bbox'])
-                    elif span['type'] == 'displayed_equation':
+                    elif span['type'] == ContentType.InterlineEquation:
-                        page_displayed_equation_list.append(span['bbox'])
+                        page_interline_equation_list.append(span['bbox'])
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
-        displayed_equation_list.append(page_displayed_equation_list)
+        interline_equation_list.append(page_interline_equation_list)
    doc = fitz.open(input_path)
    for i, page in enumerate(doc):
        # 获取当前页面的数据
        draw_bbox_without_number(i, text_list, page, [255, 0, 0])
        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
-        draw_bbox_without_number(i, displayed_equation_list, page, [0, 0, 255])
+        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
    # Save the PDF
    doc.save(f"{out_path}/text.pdf")
--- a/magic_pdf/libs/ocr_content_type.py
+++ b/magic_pdf/libs/ocr_content_type.py
+class ContentType:  # Named string constants for the span "type" field used in OCR mode.
+    Image = "image"  # image span (assigned for layout category_id 1)
+    Table = "table"  # table span (assigned for layout category_id 7)
+    Text = "text"  # OCR-recognized text span (assigned for layout category_id 15)
+    InlineEquation = "inline_equation"  # inline formula (assigned for layout category_id 13)
+    InterlineEquation = "interline_equation"  # interline formula (category_id 14); replaces the former "displayed_equation" literal
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import (
    get_docx_model_output,
 )
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
+from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
 from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
@@ -44,10 +45,10 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
        'tables': tables,
        'interline_equations': interline_equations,
        'inline_equations': inline_equations,
-        'dropped_text_block': dropped_text_block,
+        'droped_text_block': dropped_text_block,
-        'dropped_image_block': dropped_image_block,
+        'droped_image_block': dropped_image_block,
-        'dropped_table_block': dropped_table_block,
+        'droped_table_block': dropped_table_block,
-        'dropped_bboxes': need_remove_spans_bboxes_dict,
+        'droped_bboxes': need_remove_spans_bboxes_dict,
    }
    return return_dict
@@ -164,7 +165,7 @@ def parse_pdf_by_ocr(
                #  1: 'image', # 图片
                #  7: 'table',       # 表格
                #  13: 'inline_equation',     # 行内公式
-                #  14: 'displayed_equation',      # 行间公式
+                #  14: 'interline_equation',      # 行间公式
                #  15: 'text',      # ocr识别文本
                """layout信息"""
                #  11: 'full column',   # 单栏
@@ -173,20 +174,20 @@ def parse_pdf_by_ocr(
                    "bbox": bbox,
                }
                if category_id == 1:
-                    span["type"] = "image"
+                    span["type"] = ContentType.Image
                elif category_id == 7:
-                    span["type"] = "table"
+                    span["type"] = ContentType.Table
                elif category_id == 13:
                    span["content"] = layout_det["latex"]
-                    span["type"] = "inline_equation"
+                    span["type"] = ContentType.InlineEquation
                elif category_id == 14:
                    span["content"] = layout_det["latex"]
-                    span["type"] = "displayed_equation"
+                    span["type"] = ContentType.InterlineEquation
                elif category_id == 15:
                    span["content"] = layout_det["text"]
-                    span["type"] = "text"
+                    span["type"] = ContentType.Text
                # print(span)
                spans.append(span)
            else:
@@ -213,7 +214,7 @@ def parse_pdf_by_ocr(
        # bbox去除粘连
        spans = remove_overlap_between_bbox(spans)
-        # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
+        # 对type=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
        spans = adjust_bbox_for_standalone_block(spans)

--- a/magic_pdf/pre_proc/ocr_cut_image.py
+++ b/magic_pdf/pre_proc/ocr_cut_image.py
 from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.libs.pdf_image_tools import cut_image
@@ -11,9 +12,9 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
    for span in spans:
        span_type = span['type']
-        if span_type == 'image':
+        if span_type == ContentType.Image:
            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
-        elif span_type == 'table':
+        elif span_type == ContentType.Table:
            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
    return spans
--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -2,6 +2,7 @@ from loguru import logger
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
    calculate_overlap_area_in_bbox1_area_ratio
+from magic_pdf.libs.ocr_content_type import ContentType
 # 将每一个line中的span从左到右排序
@@ -29,10 +30,10 @@ def merge_spans_to_line(spans):
    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
-        # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
+        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
        # image和table类型，同上
-        if span['type'] in ["displayed_equation", "image", "table"] or any(
+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
-                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
+                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
            # 则开始新行
            lines.append(current_line)
            current_line = [span]

--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -2,6 +2,7 @@ from loguru import logger
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
    __is_overlaps_y_exceeds_threshold
+from magic_pdf.libs.ocr_content_type import ContentType
 def remove_overlaps_min_spans(spans):
@@ -49,22 +50,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
        for span in need_remove_spans:
            spans.remove(span)
            span['tag'] = drop_tag
-            if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
+            if span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
                dropped_text_block.append(span)
-            elif span['type'] == 'image':
+            elif span['type'] == ContentType.Image:
                dropped_image_block.append(span)
-            elif span['type'] == 'table':
+            elif span['type'] == ContentType.Table:
                dropped_table_block.append(span)
    return spans, dropped_text_block, dropped_image_block, dropped_table_block
 def adjust_bbox_for_standalone_block(spans):
-    # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
+    # 对type=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
    for sb_span in spans:
-        if sb_span['type'] in ["displayed_equation", "image", "table"]:
+        if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            for text_span in spans:
-                if text_span['type'] in ['text', 'inline_equation']:
+                if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
                    # 判断span2的纵向高度是否被span所覆盖
                    if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
                        # 判断span2是否在span左边
@@ -81,7 +82,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    lines = []
    current_line = [spans[0]]
-    if spans[0]["type"] in ["displayed_equation", "image", "table"]:
+    if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
        displayed_list.append(spans[0])
    line_first_y0 = spans[0]["bbox"][1]
@@ -91,16 +92,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    for span in spans[1:]:
        # if span.get("content","") == "78.":
        #     print("debug")
-        # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
+        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
        # image和table类型，同上
-        if span['type'] in ["displayed_equation", "image", "table"] or any(
+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
-                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
+                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
            # 传入
-            if span["type"] in ["displayed_equation", "image", "table"]:
+            if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
                displayed_list.append(span)
            # 则开始新行
            lines.append(current_line)
-            if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
+            if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
@@ -125,7 +126,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
        # 添加最后一行
    if current_line:
        lines.append(current_line)
-        if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
+        if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
    for line in text_inline_lines:
        # 按照x0坐标排序
@@ -159,10 +160,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
                    span['bbox'], (0, y0, 0, y1)):
                # 调整公式类型
-                if span["type"] == "displayed_equation":
+                if span["type"] == ContentType.InterlineEquation:
                    # 最后一行是行间公式
                    if j + 1 >= len(text_inline_lines):
-                        span["type"] = "inline_equation"
+                        span["type"] = ContentType.InlineEquation
                        span["bbox"][1] = y0
                        span["bbox"][3] = y1
                    else:
@@ -170,7 +171,7 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
                        y0_next, y1_next = text_inline_lines[j + 1][1]
                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
                                y1 - y0) > span_y - span_y0:
-                            span["type"] = "inline_equation"
+                            span["type"] = ContentType.InlineEquation
                            span["bbox"][1] = y0
                            span["bbox"][3] = y1
                break
@@ -193,13 +194,13 @@ def get_qa_need_list(blocks):
    for block in blocks:
        for line in block["lines"]:
            for span in line["spans"]:
-                if span["type"] == "image":
+                if span["type"] == ContentType.Image:
                    images.append(span)
-                elif span["type"] == "table":
+                elif span["type"] == ContentType.Table:
                    tables.append(span)
-                elif span["type"] == "inline_equation":
+                elif span["type"] == ContentType.InlineEquation:
                    inline_equations.append(span)
-                elif span["type"] == "displayed_equation":
+                elif span["type"] == ContentType.InterlineEquation:
                    interline_equations.append(span)
                else:
                    continue