Commit 55cba1f4 authored by 许瑞

feat: impl cli

parent ff8f62aa
...@@ -21,28 +21,97 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350 ...@@ -21,28 +21,97 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
""" """
import click
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
parse_s3path,
parse_s3_range_params,
remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.io.S3ReaderWriter import S3ReaderWriter, MODE_BIN
from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
from magic_pdf.spark.spark_api import parse_union_pdf
import os
import json as json_parse
from datetime import datetime
def prepare_env():
    """Create the output directories for one run and return them.

    The layout is ``<get_local_dir()>/magic-pdf/<YYYY-MM-DD-HH-MM-SS>/`` with
    two sub-directories, ``images`` and ``md``. Both are created if missing.

    Returns:
        A ``(image_dir, md_dir)`` tuple of directory paths.
    """
    base_dir = get_local_dir()
    run_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    run_root = os.path.join(base_dir, "magic-pdf", run_stamp)

    image_dir = os.path.join(run_root, "images")
    md_dir = os.path.join(run_root, "md")
    for target in (image_dir, md_dir):
        os.makedirs(target, exist_ok=True)
    return image_dir, md_dir
import click
@click.group()
def cli():
    # Root click group; the sub-commands below attach to it via @cli.command().
    # Deliberately a comment rather than a docstring: click would surface a
    # docstring as the group's --help text.
    pass
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
def json_command(json):
    # Parse a PDF described by a JSON record stored on S3.
    #
    # `json` is an s3:// path, optionally suffixed with "?bytes=<a>,<b>" to
    # select a byte range of the object. The record must contain the keys
    # "file_location" (s3 path of the PDF) and "doc_layout_result" (model
    # output handed to parse_union_pdf). Extracted images are written under the
    # image directory returned by prepare_env().
    #
    # Kept as comments rather than a docstring so that click's generated
    # --help output for this command is unchanged.
    if not json or not json.startswith("s3://"):
        # `json` is None when the option is omitted, so guard before using it.
        print(
            "usage: python magicpdf.py json-command --json s3://some_bucket/some_path"
        )
        raise SystemExit(1)

    def read_s3_path(s3path):
        """Read the object at ``s3path`` as bytes, honouring its own byte range.

        The range is parsed from ``s3path`` itself (not from the outer
        ``json`` argument), so a PDF path without a ``?bytes=`` suffix is read
        in full even when the JSON path carried a range.
        """
        clean_path = remove_non_official_s3_args(s3path)
        bucket, _ = parse_s3path(s3path)
        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            # No (or malformed) range: read from the start with no end bound.
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
        return s3_rw.read_jsonl(clean_path, byte_start, byte_end, MODE_BIN)

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
    pdf_data = read_s3_path(jso["file_location"])
    local_image_dir, _ = prepare_env()
    local_image_rw = DiskReaderWriter(local_image_dir)
    parse_union_pdf(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
def pdf_command(pdf, model):
    # Parse a local PDF together with its model-output JSON file.
    #
    # `pdf` is the path of an existing PDF. `model` is the path of the JSON
    # file holding the "doc_layout_result" key; when omitted it defaults to the
    # PDF path with its extension replaced by ".json". Extracted images are
    # written under the image directory returned by prepare_env().
    #
    # Kept as comments rather than a docstring so that click's generated
    # --help output for this command is unchanged.
    if model is None:
        # Swap only the final extension; str.replace would also rewrite any
        # ".pdf" occurring earlier in the path.
        model = os.path.splitext(pdf)[0] + ".json"
    if not os.path.exists(model):
        pdf_dir = os.path.dirname(os.path.abspath(pdf))
        print(f"make sure json file existed and place under {pdf_dir}")
        raise SystemExit(1)

    def read_fn(path):
        """Return the raw bytes of the local file at ``path``."""
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), MODE_BIN)

    pdf_data = read_fn(pdf)
    jso = json_parse.loads(read_fn(model).decode("utf-8"))
    local_image_dir, _ = prepare_env()
    local_image_rw = DiskReaderWriter(local_image_dir)
    parse_union_pdf(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/format/v070/part-66028dd46437-000076.jsonl?bytes=0,308393
    cli()
...@@ -5,9 +5,11 @@ from loguru import logger ...@@ -5,9 +5,11 @@ from loguru import logger
MODE_TXT = "text" MODE_TXT = "text"
MODE_BIN = "binary" MODE_BIN = "binary"
class DiskReaderWriter(AbsReaderWriter): class DiskReaderWriter(AbsReaderWriter):
def __init__(self, parent_path, encoding='utf-8'): def __init__(self, parent_path, encoding="utf-8"):
self.path = parent_path self.path = parent_path
self.encoding = encoding self.encoding = encoding
...@@ -20,10 +22,10 @@ class DiskReaderWriter(AbsReaderWriter): ...@@ -20,10 +22,10 @@ class DiskReaderWriter(AbsReaderWriter):
logger.error(f"文件 {abspath} 不存在") logger.error(f"文件 {abspath} 不存在")
raise Exception(f"文件 {abspath} 不存在") raise Exception(f"文件 {abspath} 不存在")
if mode == MODE_TXT: if mode == MODE_TXT:
with open(abspath, 'r', encoding = self.encoding) as f: with open(abspath, "r", encoding=self.encoding) as f:
return f.read() return f.read()
elif mode == MODE_BIN: elif mode == MODE_BIN:
with open(abspath, 'rb') as f: with open(abspath, "rb") as f:
return f.read() return f.read()
else: else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.") raise ValueError("Invalid mode. Use 'text' or 'binary'.")
...@@ -34,20 +36,21 @@ class DiskReaderWriter(AbsReaderWriter): ...@@ -34,20 +36,21 @@ class DiskReaderWriter(AbsReaderWriter):
else: else:
abspath = os.path.join(self.path, path) abspath = os.path.join(self.path, path)
if mode == MODE_TXT: if mode == MODE_TXT:
with open(abspath, 'w', encoding=self.encoding) as f: with open(abspath, "w", encoding=self.encoding) as f:
f.write(content) f.write(content)
logger.info(f"内容已成功写入 {abspath}") logger.info(f"内容已成功写入 {abspath}")
elif mode == MODE_BIN: elif mode == MODE_BIN:
with open(abspath, 'wb') as f: with open(abspath, "wb") as f:
f.write(content) f.write(content)
logger.info(f"内容已成功写入 {abspath}") logger.info(f"内容已成功写入 {abspath}")
else: else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.") raise ValueError("Invalid mode. Use 'text' or 'binary'.")
def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding="utf-8"):
    """Return the content of ``path`` by delegating to ``self.read(path)``.

    NOTE(review): ``byte_start``, ``byte_end`` and ``encoding`` are accepted
    but not used here, so the whole file is always returned using ``read``'s
    default mode. Presumably the parameters exist for signature parity with
    other reader/writer classes -- confirm before relying on ranges.
    """
    return self.read(path)
# 使用示例 # 使用示例
if __name__ == "__main__": if __name__ == "__main__":
file_path = "io/example.txt" file_path = "io/example.txt"
...@@ -60,5 +63,3 @@ if __name__ == "__main__": ...@@ -60,5 +63,3 @@ if __name__ == "__main__":
content = drw.read(path=file_path) content = drw.read(path=file_path)
if content: if content:
logger.info(f"从 {file_path} 读取的内容: {content}") logger.info(f"从 {file_path} 读取的内容: {content}")
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组 根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
""" """
import json import json
import os import os
...@@ -10,11 +11,7 @@ from loguru import logger ...@@ -10,11 +11,7 @@ from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key from magic_pdf.libs.commons import parse_bucket_key
def get_s3_config(bucket_name: str): def read_config():
"""
~/magic-pdf.json 读出来
"""
home_dir = os.path.expanduser("~") home_dir = os.path.expanduser("~")
config_file = os.path.join(home_dir, "magic-pdf.json") config_file = os.path.join(home_dir, "magic-pdf.json")
...@@ -24,6 +21,14 @@ def get_s3_config(bucket_name: str): ...@@ -24,6 +21,14 @@ def get_s3_config(bucket_name: str):
with open(config_file, "r") as f: with open(config_file, "r") as f:
config = json.load(f) config = json.load(f)
return config
def get_s3_config(bucket_name: str):
"""
~/magic-pdf.json 读出来
"""
config = read_config()
bucket_info = config.get("bucket_info") bucket_info = config.get("bucket_info")
if bucket_name not in bucket_info: if bucket_name not in bucket_info:
...@@ -49,5 +54,10 @@ def get_bucket_name(path): ...@@ -49,5 +54,10 @@ def get_bucket_name(path):
return bucket return bucket
def get_local_dir():
    """Return the directory under which local output should be written.

    Reads the ``"temp-output-dir"`` key from the configuration returned by
    ``read_config()``. Falls back to ``"/tmp"`` when the key is missing, or
    when its value is null/empty, so callers always get a usable path string.
    """
    config = read_config()
    return config.get("temp-output-dir") or "/tmp"
if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw") ak, sk, endpoint = get_s3_config("llm-raw")
from s3pathlib import S3Path
def remove_non_official_s3_args(s3path):
    """Strip everything from the first ``?`` onwards.

    Example:
        ``s3://abc/xxxx.json?bytes=0,81350`` -> ``s3://abc/xxxx.json``
    """
    bare_path, _, _ = s3path.partition("?")
    return bare_path
def parse_s3path(s3path: str):
    """Split an S3 path into ``(bucket, key)``.

    Any ``?...`` suffix is removed with ``remove_non_official_s3_args`` before
    the path is handed to ``S3Path``.
    """
    bare_path = remove_non_official_s3_args(s3path)
    parsed = S3Path(bare_path)
    bucket, key = parsed.bucket, parsed.key
    return bucket, key
def parse_s3_range_params(s3path: str):
    """Extract the ``bytes`` range parameter from an S3 path.

    Example:
        ``s3://abc/xxxx.json?bytes=0,81350`` -> ``["0", "81350"]``

    The ``bytes`` parameter is found wherever it appears in the query string,
    and other ``&``-separated parameters are ignored.

    Returns:
        The comma-separated pieces of the ``bytes`` value as a list of
        strings (callers convert them to int), or ``None`` when the path has
        no ``bytes`` parameter.
    """
    _, has_query, query = s3path.partition("?")
    if not has_query:
        return None
    for param in query.split("&"):
        name, has_value, value = param.partition("=")
        if name == "bytes" and has_value:
            return value.split(",")
    return None
...@@ -15,4 +15,5 @@ wordninja>=2.0.0 ...@@ -15,4 +15,5 @@ wordninja>=2.0.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
zh_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl
scikit-learn==1.4.1.post1
nltk==3.8.1
s3pathlib>=2.1.1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment