为 OCR 模式的 demo 增加 online 模式，并微调 pipeline 以适配 online 模式

ce96c3f6 · 赵小蒙 · 49bf40cc · ce96c3f6 · ce96c3f6 · ce96c3f6
Commit ce96c3f6 authored Mar 20, 2024 by 赵小蒙
Showing with 59 additions and 52 deletions

demo_test.py demo/demo_test.py +1 -1

ocr_demo.py demo/ocr_demo.py +48 -39

draw_bbox.py magic_pdf/libs/draw_bbox.py +8 -10

pdf_parse_by_ocr.py magic_pdf/pdf_parse_by_ocr.py +2 -2

No files found.
--- a/demo/demo_test.py
+++ b/demo/demo_test.py
@@ -34,7 +34,7 @@ def get_json_from_local_or_s3(book_name=None):
        s3_config = get_s3_config(json_path)
        file_content = read_file(json_path, s3_config)
        json_str = file_content.decode("utf-8")
-        logger.info(json_str)
+        # logger.info(json_str)
        json_object = json.loads(json_str)
    return json_object

--- a/demo/ocr_demo.py
+++ b/demo/ocr_demo.py
@@ -4,6 +4,7 @@ import os
 from loguru import logger
 from pathlib import Path
+from app.common.s3 import get_s3_config
 from demo.demo_test import get_json_from_local_or_s3
 from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format
 from magic_pdf.libs.commons import join_path
@@ -35,50 +36,58 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path):
        ocr_pdf_model_info = read_json_file(ocr_json_file_path)
        pth = Path(ocr_json_file_path)
        book_name = pth.name
-        save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
+        ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info)
-        save_path = join_path(save_tmp_path, "md")
-        save_path_with_bookname = os.path.join(save_path, book_name)
-        text_content_save_path = f"{save_path_with_bookname}/book.md"
-        pdf_info_dict = parse_pdf_by_ocr(
-            ocr_pdf_path,
-            None,
-            ocr_pdf_model_info,
-            save_path,
-            book_name,
-            debug_mode=True)
-        parent_dir = os.path.dirname(text_content_save_path)
-        if not os.path.exists(parent_dir):
-            os.makedirs(parent_dir)
-        # markdown_content = mk_nlp_markdown(pdf_info_dict)
-        markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
-        with open(text_content_save_path, "w", encoding="utf-8") as f:
-            f.write(markdown_content)
-        standard_format = ocr_mk_mm_standard_format(pdf_info_dict)
-        standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"
-        with open(standard_format_save_path, "w", encoding="utf-8") as f:
-            f.write(str(standard_format))
-        # logger.info(markdown_content)
-        # save_markdown(markdown_text, ocr_json_file_path)
    except Exception as e:
        logger.exception(e)
 def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
-    json_object = get_json_from_local_or_s3(book_name)
+    try:
-    logger.info(json_object)
+        json_object = get_json_from_local_or_s3(book_name)
+        # logger.info(json_object)
+        s3_pdf_path = json_object["file_location"]
+        s3_config = get_s3_config(s3_pdf_path)
+        ocr_pdf_model_info = json_object["doc_layout_result"]
+        ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config)
+    except Exception as e:
+        logger.exception(e)
+def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, s3_config=None):
+    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
+    save_path = join_path(save_tmp_path, "md")
+    save_path_with_bookname = os.path.join(save_path, book_name)
+    text_content_save_path = f"{save_path_with_bookname}/book.md"
+    pdf_info_dict = parse_pdf_by_ocr(
+        ocr_pdf_path,
+        s3_config,
+        ocr_pdf_model_info,
+        save_path,
+        book_name,
+        debug_mode=True)
+    parent_dir = os.path.dirname(text_content_save_path)
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+    # markdown_content = mk_nlp_markdown(pdf_info_dict)
+    markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
+    with open(text_content_save_path, "w", encoding="utf-8") as f:
+        f.write(markdown_content)
+    standard_format = ocr_mk_mm_standard_format(pdf_info_dict)
+    standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"
+    with open(standard_format_save_path, "w", encoding="utf-8") as f:
+        f.write(str(standard_format))
 if __name__ == '__main__':
-    #ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
+    # pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
-    #ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
+    # json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
-    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
+    # pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
-    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
+    # json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
-    ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
+    # pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
-    ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
+    # json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
+    # ocr_local_parse(pdf_path, json_file_path)
    ocr_online_parse(book_name="数学新星网/edu_00001236")
-    ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
-    pass
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -27,7 +27,7 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
        page.insert_text((x0, y0), str(j + 1), fontsize=10, color=new_rgb)  # Insert the index at the top left corner of the rectangle
-def draw_layout_bbox(pdf_info_dict, input_path, out_path):
+def draw_layout_bbox(pdf_info_dict, pdf_bytes, out_path):
    layout_bbox_list = []
    dropped_bbox_list = []
    for page in pdf_info_dict.values():
@@ -40,15 +40,14 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
            for dropped_bbox in dropped_bboxes:
                page_dropped_list.append(dropped_bbox)
        dropped_bbox_list.append(page_dropped_list)
+    pdf_docs = fitz.open("pdf", pdf_bytes)
-    doc = fitz.open(input_path)
+    for i, page in enumerate(pdf_docs):
-    for i, page in enumerate(doc):
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
        draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0])
    # Save the PDF
-    doc.save(f"{out_path}/layout.pdf")
+    pdf_docs.save(f"{out_path}/layout.pdf")
-def draw_text_bbox(pdf_info_dict, input_path, out_path):
+def draw_text_bbox(pdf_info_dict, pdf_bytes, out_path):
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
@@ -68,13 +67,12 @@ def draw_text_bbox(pdf_info_dict, input_path, out_path):
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
+    pdf_docs = fitz.open("pdf", pdf_bytes)
-    doc = fitz.open(input_path)
+    for i, page in enumerate(pdf_docs):
-    for i, page in enumerate(doc):
        # 获取当前页面的数据
        draw_bbox_without_number(i, text_list, page, [255, 0, 0])
        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
    # Save the PDF
-    doc.save(f"{out_path}/text.pdf")
+    pdf_docs.save(f"{out_path}/text.pdf")
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -282,7 +282,7 @@ def parse_pdf_by_ocr(
            json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
        # draw_bbox
-        draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
+        draw_layout_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
-        draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
+        draw_text_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
    return pdf_info_dict