Commit ef0129ad authored by kernel.h@qq.com

修改pdf的路径

parent ed40e1d5
......@@ -23,9 +23,9 @@ python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloa
import os
import json as json_parse
from datetime import datetime
import click
from loguru import logger
from pathlib import Path
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
......@@ -44,9 +44,9 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def prepare_env():
def prepare_env(pdf_file_name):
local_parent_dir = os.path.join(
get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d")
get_local_dir(), "magic-pdf",pdf_file_name
)
local_image_dir = os.path.join(local_parent_dir, "images")
......@@ -56,7 +56,7 @@ def prepare_env():
return local_image_dir, local_md_dir
def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
if parse_method == "auto":
pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
elif parse_method == "txt":
......@@ -70,13 +70,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown()
part_file_name = datetime.now().strftime("%H-%M-%S")
#part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer.write(
content=md_content, path=f"{part_file_name}.md", mode=AbsReaderWriter.MODE_TXT
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
)
md_writer.write(
content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
path=f"{part_file_name}.json",
path=f"{pdf_file_name}.json",
mode=AbsReaderWriter.MODE_TXT,
)
# try:
......@@ -127,14 +127,17 @@ def json_command(json, method):
)
jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
pdf_data = read_s3_path(jso["file_location"])
local_image_dir, local_md_dir = prepare_env()
s3_file_path = jso["file_location"]
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso["doc_layout_result"],
method,
......@@ -169,11 +172,13 @@ def pdf_command(pdf, model, method):
pdf_data = read_fn(pdf)
jso = json_parse.loads(read_fn(model).decode("utf-8"))
local_image_dir, local_md_dir = prepare_env()
pdf_file_name = Path(pdf).stem
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso,
method,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment