Commit 9bd6294b authored by 赵小蒙's avatar 赵小蒙

ocr模式更新spark pipeline

parent 8a52ada3
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType
def mk_nlp_markdown(pdf_info_dict: dict): def ocr_mk_nlp_markdown(pdf_info_dict: dict):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for _, page_info in pdf_info_dict.items():
...@@ -25,7 +25,7 @@ def mk_nlp_markdown(pdf_info_dict: dict): ...@@ -25,7 +25,7 @@ def mk_nlp_markdown(pdf_info_dict: dict):
return '\n'.join(markdown) return '\n'.join(markdown)
def mk_mm_markdown(pdf_info_dict: dict): def ocr_mk_mm_markdown(pdf_info_dict: dict):
markdown = [] markdown = []
......
...@@ -3,6 +3,7 @@ import sys ...@@ -3,6 +3,7 @@ import sys
import time import time
from urllib.parse import quote from urllib.parse import quote
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time
from magic_pdf.libs.drop_reason import DropReason from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor from magic_pdf.libs.json_compressor import JsonCompressor
...@@ -13,6 +14,7 @@ from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan ...@@ -13,6 +14,7 @@ from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from loguru import logger from loguru import logger
from app.common.s3 import get_s3_config, get_s3_client from app.common.s3 import get_s3_config, get_s3_client
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
def exception_handler(jso: dict, e): def exception_handler(jso: dict, e):
...@@ -312,7 +314,77 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: ...@@ -312,7 +314,77 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
pass # 检测debug开关
if debug_mode:
pass
else: # 如果debug没开,则检测是否有needdrop字段
if jso.get('need_drop', False):
return jso
s3_pdf_path = jso.get('file_location')
s3_config = get_s3_config(s3_pdf_path)
model_output_json_list = jso.get('doc_layout_result')
data_source = get_data_source(jso)
file_id = jso.get('file_id')
book_name = data_source + "/" + file_id
try:
save_path = "s3://mllm-raw-media/pdf2md_img/"
image_s3_config = get_s3_config(save_path)
start_time = time.time() # 记录开始时间
# 先打印一下book_name和解析开始的时间
logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr)
pdf_info_dict = parse_pdf_by_ocr(
s3_pdf_path,
s3_config,
model_output_json_list,
save_path,
book_name,
pdf_model_profile=None,
image_s3_config=image_s3_config,
start_page_id=start_page_id,
debug_mode=debug_mode
)
if pdf_info_dict.get('need_drop', False): # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析
jso['need_drop'] = True
jso['drop_reason'] = pdf_info_dict["drop_reason"]
else: # 正常返回,将 pdf_info_dict 压缩并存储
pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
jso['pdf_intermediate_dict'] = pdf_info_dict
end_time = time.time() # 记录完成时间
parse_time = int(end_time - start_time) # 计算执行时间
# 解析完成后打印一下book_name和耗时
logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
file=sys.stderr)
jso['parse_time'] = parse_time
except Exception as e:
jso = exception_handler(jso, e)
return jso
def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
if debug_mode:
pass
else: # 如果debug没开,则检测是否有needdrop字段
if jso.get('need_drop', False):
book_name = join_path(get_data_source(jso), jso['file_id'])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
return jso
try:
pdf_intermediate_dict = jso['pdf_intermediate_dict']
# 将 pdf_intermediate_dict 解压
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict)
jso["content"] = markdown_content
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
# 把无用的信息清空
jso["doc_layout_result"] = ""
jso["pdf_intermediate_dict"] = ""
jso["pdf_meta"] = ""
except Exception as e:
jso = exception_handler(jso, e)
return jso
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment