Commit d5e30f8d authored by 赵小蒙

fix: use deepcopy to keep the original model json

parent 8e537ed5
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
效果: 效果:
python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350 python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
""" """
import os import os
...@@ -45,6 +45,7 @@ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter ...@@ -45,6 +45,7 @@ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
import csv import csv
import copy
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"]) parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
...@@ -81,6 +82,7 @@ def do_parse( ...@@ -81,6 +82,7 @@ def do_parse(
f_dump_orig_pdf=True, f_dump_orig_pdf=True,
f_dump_content_list=True, f_dump_content_list=True,
): ):
orig_model_list = copy.deepcopy(model_list)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method) local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir) image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
...@@ -130,7 +132,7 @@ def do_parse( ...@@ -130,7 +132,7 @@ def do_parse(
if f_dump_model_json: if f_dump_model_json:
"""写model_json""" """写model_json"""
md_writer.write( md_writer.write(
content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4), content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
path=f"{pdf_file_name}_model.json", path=f"{pdf_file_name}_model.json",
mode=AbsReaderWriter.MODE_TXT, mode=AbsReaderWriter.MODE_TXT,
) )
...@@ -143,7 +145,7 @@ def do_parse( ...@@ -143,7 +145,7 @@ def do_parse(
mode=AbsReaderWriter.MODE_BIN, mode=AbsReaderWriter.MODE_BIN,
) )
content_list = pipe.pipe_mk_uni_format(str(image_dir), drop_mode=DropMode.NONE) content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
if f_dump_content_list: if f_dump_content_list:
"""写content_list""" """写content_list"""
md_writer.write( md_writer.write(
...@@ -278,7 +280,7 @@ def pdf_command(pdf, model, method): ...@@ -278,7 +280,7 @@ def pdf_command(pdf, model, method):
model_path = pdf.replace(".pdf", ".json") model_path = pdf.replace(".pdf", ".json")
if not os.path.exists(model_path): if not os.path.exists(model_path):
logger.warning( logger.warning(
f"not found json {model_path} existed, use paddle analyze" f"not found json {model_path} existed"
) )
# 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
model_json = "[]" model_json = "[]"
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment