Commit ce96c3f6 authored by 赵小蒙

为ocr模式的demo增加online模式,pipeline进行微调适配online模式

parent 49bf40cc
...@@ -34,7 +34,7 @@ def get_json_from_local_or_s3(book_name=None): ...@@ -34,7 +34,7 @@ def get_json_from_local_or_s3(book_name=None):
s3_config = get_s3_config(json_path) s3_config = get_s3_config(json_path)
file_content = read_file(json_path, s3_config) file_content = read_file(json_path, s3_config)
json_str = file_content.decode("utf-8") json_str = file_content.decode("utf-8")
logger.info(json_str) # logger.info(json_str)
json_object = json.loads(json_str) json_object = json.loads(json_str)
return json_object return json_object
......
...@@ -4,6 +4,7 @@ import os ...@@ -4,6 +4,7 @@ import os
from loguru import logger from loguru import logger
from pathlib import Path from pathlib import Path
from app.common.s3 import get_s3_config
from demo.demo_test import get_json_from_local_or_s3 from demo.demo_test import get_json_from_local_or_s3
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
...@@ -35,50 +36,58 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path): ...@@ -35,50 +36,58 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path):
ocr_pdf_model_info = read_json_file(ocr_json_file_path) ocr_pdf_model_info = read_json_file(ocr_json_file_path)
pth = Path(ocr_json_file_path) pth = Path(ocr_json_file_path)
book_name = pth.name book_name = pth.name
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest") ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info)
save_path = join_path(save_tmp_path, "md")
save_path_with_bookname = os.path.join(save_path, book_name)
text_content_save_path = f"{save_path_with_bookname}/book.md"
pdf_info_dict = parse_pdf_by_ocr(
ocr_pdf_path,
None,
ocr_pdf_model_info,
save_path,
book_name,
debug_mode=True)
parent_dir = os.path.dirname(text_content_save_path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
with open(text_content_save_path, "w", encoding="utf-8") as f:
f.write(markdown_content)
standard_format = ocr_mk_mm_standard_format(pdf_info_dict)
standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"
with open(standard_format_save_path, "w", encoding="utf-8") as f:
f.write(str(standard_format))
# logger.info(markdown_content)
# save_markdown(markdown_text, ocr_json_file_path)
except Exception as e: except Exception as e:
logger.exception(e) logger.exception(e)
def ocr_online_parse(book_name, start_page_id=0, debug_mode=True): def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
json_object = get_json_from_local_or_s3(book_name) try:
logger.info(json_object) json_object = get_json_from_local_or_s3(book_name)
# logger.info(json_object)
s3_pdf_path = json_object["file_location"]
s3_config = get_s3_config(s3_pdf_path)
ocr_pdf_model_info = json_object["doc_layout_result"]
ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config)
except Exception as e:
logger.exception(e)
def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, s3_config=None):
    """Parse one PDF through the OCR pipeline and write the results to disk.

    Shared by the local and online demo entry points. The output directory
    is ``<this file's dir>/../../tmp/unittest/md/<book_name>``; two files are
    written there: ``book.md`` (multimodal markdown with paragraphs) and
    ``standard_format.txt`` (``str()`` of the standard-format result).

    Args:
        book_name: Name used for the per-book output sub-directory and
            forwarded to ``parse_pdf_by_ocr``.
        ocr_pdf_path: Location of the PDF, forwarded to ``parse_pdf_by_ocr``.
        ocr_pdf_model_info: OCR model output for the PDF, forwarded to
            ``parse_pdf_by_ocr``.
        start_page_id: Accepted for interface compatibility; it is not used
            by this function.
        s3_config: Forwarded as the second positional argument of
            ``parse_pdf_by_ocr``. Defaults to ``None``, which is what the
            local entry point relies on.

    Exceptions raised by the pipeline or by file I/O propagate to the caller.
    """
    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
    save_path = join_path(save_tmp_path, "md")
    save_path_with_bookname = os.path.join(save_path, book_name)
    text_content_save_path = f"{save_path_with_bookname}/book.md"
    standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"

    pdf_info_dict = parse_pdf_by_ocr(
        ocr_pdf_path,
        s3_config,
        ocr_pdf_model_info,
        save_path,
        book_name,
        debug_mode=True,
    )

    # exist_ok=True avoids the check-then-create race of an explicit
    # os.path.exists() test and is a no-op when the directory is already there.
    os.makedirs(os.path.dirname(text_content_save_path), exist_ok=True)

    markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
    with open(text_content_save_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    standard_format = ocr_mk_mm_standard_format(pdf_info_dict)
    with open(standard_format_save_path, "w", encoding="utf-8") as f:
        f.write(str(standard_format))
if __name__ == '__main__': if __name__ == '__main__':
#ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf" # pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json" # json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf" # pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json" # json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf" # pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json" # json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
# ocr_local_parse(pdf_path, json_file_path)
ocr_online_parse(book_name="数学新星网/edu_00001236") ocr_online_parse(book_name="数学新星网/edu_00001236")
ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
pass
...@@ -27,7 +27,7 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config): ...@@ -27,7 +27,7 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
page.insert_text((x0, y0), str(j + 1), fontsize=10, color=new_rgb) # Insert the index at the top left corner of the rectangle page.insert_text((x0, y0), str(j + 1), fontsize=10, color=new_rgb) # Insert the index at the top left corner of the rectangle
def draw_layout_bbox(pdf_info_dict, input_path, out_path): def draw_layout_bbox(pdf_info_dict, pdf_bytes, out_path):
layout_bbox_list = [] layout_bbox_list = []
dropped_bbox_list = [] dropped_bbox_list = []
for page in pdf_info_dict.values(): for page in pdf_info_dict.values():
...@@ -40,15 +40,14 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path): ...@@ -40,15 +40,14 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
for dropped_bbox in dropped_bboxes: for dropped_bbox in dropped_bboxes:
page_dropped_list.append(dropped_bbox) page_dropped_list.append(dropped_bbox)
dropped_bbox_list.append(page_dropped_list) dropped_bbox_list.append(page_dropped_list)
pdf_docs = fitz.open("pdf", pdf_bytes)
doc = fitz.open(input_path) for i, page in enumerate(pdf_docs):
for i, page in enumerate(doc):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0]) draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0]) draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0])
# Save the PDF # Save the PDF
doc.save(f"{out_path}/layout.pdf") pdf_docs.save(f"{out_path}/layout.pdf")
def draw_text_bbox(pdf_info_dict, input_path, out_path): def draw_text_bbox(pdf_info_dict, pdf_bytes, out_path):
text_list = [] text_list = []
inline_equation_list = [] inline_equation_list = []
interline_equation_list = [] interline_equation_list = []
...@@ -68,13 +67,12 @@ def draw_text_bbox(pdf_info_dict, input_path, out_path): ...@@ -68,13 +67,12 @@ def draw_text_bbox(pdf_info_dict, input_path, out_path):
text_list.append(page_text_list) text_list.append(page_text_list)
inline_equation_list.append(page_inline_equation_list) inline_equation_list.append(page_inline_equation_list)
interline_equation_list.append(page_interline_equation_list) interline_equation_list.append(page_interline_equation_list)
pdf_docs = fitz.open("pdf", pdf_bytes)
doc = fitz.open(input_path) for i, page in enumerate(pdf_docs):
for i, page in enumerate(doc):
# 获取当前页面的数据 # 获取当前页面的数据
draw_bbox_without_number(i, text_list, page, [255, 0, 0]) draw_bbox_without_number(i, text_list, page, [255, 0, 0])
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0]) draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255]) draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
# Save the PDF # Save the PDF
doc.save(f"{out_path}/text.pdf") pdf_docs.save(f"{out_path}/text.pdf")
...@@ -282,7 +282,7 @@ def parse_pdf_by_ocr( ...@@ -282,7 +282,7 @@ def parse_pdf_by_ocr(
json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4) json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
# drow_bbox # drow_bbox
draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path) draw_layout_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path) draw_text_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
return pdf_info_dict return pdf_info_dict
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment