Commit cc568d5e authored by blue

feat: update magic cli

parent 924b33c4
...@@ -50,9 +50,7 @@ parse_pdf_methods = click.Choice(["ocr", "txt", "auto"]) ...@@ -50,9 +50,7 @@ parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def prepare_env(pdf_file_name, method): def prepare_env(pdf_file_name, method):
local_parent_dir = os.path.join( local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
get_local_dir(), "magic-pdf", pdf_file_name, method
)
local_image_dir = os.path.join(str(local_parent_dir), "images") local_image_dir = os.path.join(str(local_parent_dir), "images")
local_md_dir = local_parent_dir local_md_dir = local_parent_dir
...@@ -62,7 +60,7 @@ def prepare_env(pdf_file_name, method): ...@@ -62,7 +60,7 @@ def prepare_env(pdf_file_name, method):
def write_to_csv(csv_file_path, csv_data): def write_to_csv(csv_file_path, csv_data):
with open(csv_file_path, mode='a', newline='', encoding='utf-8') as csvfile: with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
# 创建csv writer对象 # 创建csv writer对象
csv_writer = csv.writer(csvfile) csv_writer = csv.writer(csvfile)
# 写入数据 # 写入数据
...@@ -70,12 +68,28 @@ def write_to_csv(csv_file_path, csv_data): ...@@ -70,12 +68,28 @@ def write_to_csv(csv_file_path, csv_data):
print(f"数据已成功追加到 '{csv_file_path}'") print(f"数据已成功追加到 '{csv_file_path}'")
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir): def do_parse(
pdf_file_name,
pdf_bytes,
model_list,
parse_method,
f_draw_span_bbox=True,
f_draw_layout_bbox=True,
f_dump_md=True,
f_dump_middle_json=True,
f_dump_model_json=True,
f_dump_orig_pdf=True,
f_dump_content_list=True,
):
local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
image_dir = (os.path.basename(local_image_dir),)
if parse_method == "auto": if parse_method == "auto":
jso_useful_key = { jso_useful_key = {"_pdf_type": "", "model_list": model_list}
"_pdf_type": "",
"model_list": model_list
}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True) pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
elif parse_method == "txt": elif parse_method == "txt":
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True) pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
...@@ -87,48 +101,62 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, ...@@ -87,48 +101,62 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
pipe.pipe_classify() pipe.pipe_classify()
'''如果没有传入有效的模型数据,则使用内置paddle解析''' """如果没有传入有效的模型数据,则使用内置paddle解析"""
if len(model_list) == 0: if len(model_list) == 0:
pipe.pipe_analyze() pipe.pipe_analyze()
pipe.pipe_parse() pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data['pdf_info'] pdf_info = pipe.pdf_mid_data["pdf_info"]
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir) if f_draw_layout_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir) draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
# write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv", # write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
# [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']]) # [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE) md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
'''写markdown'''
md_writer.write( if f_dump_md:
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT """写markdown"""
) md_writer.write(
'''写middle_json''' content=md_content,
md_writer.write( path=f"{pdf_file_name}.md",
content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), mode=AbsReaderWriter.MODE_TXT,
path=f"{pdf_file_name}_middle.json", )
mode=AbsReaderWriter.MODE_TXT,
) if f_dump_middle_json:
'''写model_json''' """写middle_json"""
md_writer.write( md_writer.write(
content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4), content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
path=f"{pdf_file_name}_model.json", path=f"{pdf_file_name}_middle.json",
mode=AbsReaderWriter.MODE_TXT, mode=AbsReaderWriter.MODE_TXT,
) )
'''写源pdf'''
md_writer.write( if f_dump_model_json:
content=pdf_bytes, """写model_json"""
path=f"{pdf_file_name}_origin.pdf", md_writer.write(
mode=AbsReaderWriter.MODE_BIN, content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
) path=f"{pdf_file_name}_model.json",
mode=AbsReaderWriter.MODE_TXT,
)
if f_dump_orig_pdf:
"""写源pdf"""
md_writer.write(
content=pdf_bytes,
path=f"{pdf_file_name}_origin.pdf",
mode=AbsReaderWriter.MODE_BIN,
)
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE) content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
'''写content_list'''
md_writer.write( if f_dump_content_list:
content=json_parse.dumps(content_list, ensure_ascii=False, indent=4), """写content_list"""
path=f"{pdf_file_name}_content_list.json", md_writer.write(
mode=AbsReaderWriter.MODE_TXT content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
) path=f"{pdf_file_name}_content_list.json",
mode=AbsReaderWriter.MODE_TXT,
)
@click.group() @click.group()
...@@ -177,13 +205,14 @@ def json_command(json, method): ...@@ -177,13 +205,14 @@ def json_command(json, method):
s3_file_path = jso.get("path") s3_file_path = jso.get("path")
pdf_file_name = Path(s3_file_path).stem pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path) pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method) local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter( local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir local_md_dir
) )
_do_parse( do_parse(
pdf_file_name, pdf_file_name,
pdf_data, pdf_data,
jso["doc_layout_result"], jso["doc_layout_result"],
...@@ -191,7 +220,7 @@ def json_command(json, method): ...@@ -191,7 +220,7 @@ def json_command(json, method):
local_image_rw, local_image_rw,
local_md_rw, local_md_rw,
os.path.basename(local_image_dir), os.path.basename(local_image_dir),
local_md_dir local_md_dir,
) )
...@@ -235,11 +264,11 @@ def local_json_command(local_json, method): ...@@ -235,11 +264,11 @@ def local_json_command(local_json, method):
pdf_data = read_s3_path(s3_file_path) pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method) local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter( local_image_rw, local_md_rw = DiskReaderWriter(
local_md_dir local_image_dir
) ), DiskReaderWriter(local_md_dir)
_do_parse( do_parse(
pdf_file_name, pdf_file_name,
pdf_data, pdf_data,
jso["doc_layout_result"], jso["doc_layout_result"],
...@@ -247,7 +276,7 @@ def local_json_command(local_json, method): ...@@ -247,7 +276,7 @@ def local_json_command(local_json, method):
local_image_rw, local_image_rw,
local_md_rw, local_md_rw,
os.path.basename(local_image_dir), os.path.basename(local_image_dir),
local_md_dir local_md_dir,
) )
...@@ -274,7 +303,9 @@ def pdf_command(pdf, model, method): ...@@ -274,7 +303,9 @@ def pdf_command(pdf, model, method):
if model_path is None: if model_path is None:
model_path = pdf.replace(".pdf", ".json") model_path = pdf.replace(".pdf", ".json")
if not os.path.exists(model_path): if not os.path.exists(model_path):
logger.warning(f"not found json {model_path} existed, use paddle analyze") logger.warning(
f"not found json {model_path} existed, use paddle analyze"
)
# 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
model_json = "[]" model_json = "[]"
else: else:
...@@ -286,11 +317,12 @@ def pdf_command(pdf, model, method): ...@@ -286,11 +317,12 @@ def pdf_command(pdf, model, method):
jso = json_parse.loads(get_model_json(model)) jso = json_parse.loads(get_model_json(model))
pdf_file_name = Path(pdf).stem pdf_file_name = Path(pdf).stem
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method) local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter( local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir local_md_dir
) )
_do_parse( do_parse(
pdf_file_name, pdf_file_name,
pdf_data, pdf_data,
jso, jso,
...@@ -298,7 +330,7 @@ def pdf_command(pdf, model, method): ...@@ -298,7 +330,7 @@ def pdf_command(pdf, model, method):
local_image_rw, local_image_rw,
local_md_rw, local_md_rw,
os.path.basename(local_image_dir), os.path.basename(local_image_dir),
local_md_dir local_md_dir,
) )
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment