Commit 2277e31f authored by 赵小蒙

ocr_demo main函数精简

parent 7d010e19
...@@ -6,7 +6,13 @@ from pathlib import Path ...@@ -6,7 +6,13 @@ from pathlib import Path
from app.common.s3 import get_s3_config from app.common.s3 import get_s3_config
from demo.demo_test import get_json_from_local_or_s3 from demo.demo_test import get_json_from_local_or_s3
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_mm_markdown_with_para,
ocr_mk_nlp_markdown,
ocr_mk_mm_markdown,
ocr_mk_mm_standard_format,
ocr_mk_mm_markdown_with_para_and_pagination
)
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
...@@ -47,7 +53,7 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True): ...@@ -47,7 +53,7 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
# logger.info(json_object) # logger.info(json_object)
s3_pdf_path = json_object["file_location"] s3_pdf_path = json_object["file_location"]
s3_config = get_s3_config(s3_pdf_path) s3_config = get_s3_config(s3_pdf_path)
ocr_pdf_model_info = json_object["doc_layout_result"] ocr_pdf_model_info = json_object.get("doc_layout_result")
ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config) ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config)
except Exception as e: except Exception as e:
logger.exception(e) logger.exception(e)
...@@ -72,6 +78,7 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, ...@@ -72,6 +78,7 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
# markdown_content = mk_nlp_markdown(pdf_info_dict) # markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict) markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
# markdown_pagination = ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict)
with open(text_content_save_path, "w", encoding="utf-8") as f: with open(text_content_save_path, "w", encoding="utf-8") as f:
f.write(markdown_content) f.write(markdown_content)
...@@ -83,14 +90,9 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, ...@@ -83,14 +90,9 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
if __name__ == '__main__': if __name__ == '__main__':
#ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf" # pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
#ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json" # json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf" # ocr_local_parse(pdf_path, json_file_path)
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json" # book_name = "数学新星网/edu_00001236"
# ocr_online_parse(book_name)
ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf" pass
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
#ocr_online_parse(book_name="美国加州中学教材/edu_00000060")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment