lkw

83deab21 · liukaiwen · c38c784e · 83deab21 · 83deab21 · 83deab21
Commit 83deab21 authored Mar 11, 2024 by liukaiwen
Hide whitespace changes
Inline Side-by-side

Showing with 65 additions and 15 deletions

draw_bbox.py demo/draw_bbox.py +30 -5

ocr_dict_merge.py magic_pdf/libs/ocr_dict_merge.py +33 -8

pdf_parse_by_ocr.py magic_pdf/pdf_parse_by_ocr.py +2 -2

No files found.
--- a/demo/draw_bbox.py
+++ b/demo/draw_bbox.py
 from magic_pdf.libs.commons import fitz  # PyMuPDF
+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
+import json
+
+
+
+
+
+def read_json_file(file_path):
+    with open(file_path, 'r') as f:
+        data = json.load(f)
+    return data
+

 # PDF文件路径
-pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
+pdf_path = "D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_org.pdf"

 doc = fitz.open(pdf_path)  # Open the PDF
 # 你的数据
 data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
-
+ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
+ocr_pdf_info = read_json_file(ocr_json_file_path)
+pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
+data_list = []
+for page in pdf_info_dict.values():
+    page_list = []
+    blocks = page.get("preproc_blocks")
+    for block in blocks:
+        lines = block.get("lines")
+        for line in lines:
+            spans = line.get("spans")
+            for span in spans:
+                page_list.append(span["bbox"])
+    data_list.append(page_list)
 # 对每个页面进行处理
 for i, page in enumerate(doc):
    # 获取当前页面的数据
-    page_data = data[i]
+    page_data = data_list[i]
    for img in page_data:
-        x0, y0, x1, y1, _ = img
+        x0, y0, x1, y1 = img
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True)  # Draw the rectangle

 # Save the PDF
-doc.save("D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018_new.pdf")
\ No newline at end of file
+doc.save("D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_new.pdf")
\ No newline at end of file
--- a/magic_pdf/libs/ocr_dict_merge.py
+++ b/magic_pdf/libs/ocr_dict_merge.py
@@ -74,6 +74,7 @@ def modify_y_axis(spans: list):
    current_line = [spans[0]]
    if spans[0]["type"] in ["displayed_equation", "image", "table"]:
        displayed_list.append(spans[0])
+
    line_first_y0 = spans[0]["bbox"][1]
    line_first_y = spans[0]["bbox"][3]
    #用于给行间公式搜索
@@ -89,15 +90,16 @@ def modify_y_axis(spans: list):
            # 则开始新行
            lines.append(current_line)
            current_line = [span]
-            line_first_y0 = spans[0]["bbox"][1]
-            line_first_y = spans[0]["bbox"][3]
+            line_first_y0 = span["bbox"][1]
+            line_first_y = span["bbox"][3]
            continue

        # 如果当前的span与当前行的最后一个span在y轴上重叠，则添加到当前行
        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
-
-            span["bbox"][1] = line_first_y0
-            span["bbox"][3] = line_first_y
+            if span["bbox"][1] < line_first_y0:
+                line_first_y0 = span["bbox"][1]
+            if span["bbox"][3] > line_first_y:
+                line_first_y = span["bbox"][3]
            current_line.append(span)

        else:
@@ -111,18 +113,41 @@ def modify_y_axis(spans: list):
        # 添加最后一行
    if current_line:
        lines.append(current_line)
+        if len(current_line)>1 or current_line[0]["type"] in ["text", "inline_equation"]:
+            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))

    for line in text_inline_lines:
        # 按照x0坐标排序
-        line.sort(key=lambda span: span[0]['bbox'][0])
-
+        current_line = line[0]
+        current_line.sort(key=lambda span: span['bbox'][0])


+    #调整每一个文字行内bbox统一
+    for line in text_inline_lines:
+        current_line, (line_first_y0, line_first_y) = line
+        for span in current_line:
+            span["bbox"][1] = line_first_y0
+            span["bbox"][3] = line_first_y
    #错误行间公式转行内公式
+    j = 0
    for i in range(len(displayed_list)):
        span = displayed_list[i]
+        span_y0, span_y = span["bbox"][1], span["bbox"][3]
+        while j < len(text_inline_lines):
+            text_line = text_inline_lines[j]
+            y0, y1 = text_line[1]
+            if span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1 and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+                span["bbox"][1] = y0
+                span["bbox"][3] = y1
+                if span["type"] == "displayed_equation":
+                    span["type"] = "inline_equation"
+                break
+            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+                break
+            else:
+                j += 1

-
+    return spans




--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
 from loguru import logger

-from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
+from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans, modify_y_axis


 def construct_page_component(page_id, blocks):
@@ -68,7 +68,7 @@ def parse_pdf_by_ocr(
        spans = remove_overlaps_min_spans(spans)

        # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
-
+        #spans = modify_y_axis(spans)

        # 将spans合并成line(从上到下,从左到右)
        lines = merge_spans_to_line(spans)