Commit 4c37e741 authored by 许瑞

feat: support multiple pdf parse method

parent 55cba1f4
...@@ -31,12 +31,23 @@ from magic_pdf.libs.path_utils import ( ...@@ -31,12 +31,23 @@ from magic_pdf.libs.path_utils import (
from magic_pdf.libs.config_reader import get_local_dir from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.io.S3ReaderWriter import S3ReaderWriter, MODE_BIN from magic_pdf.io.S3ReaderWriter import S3ReaderWriter, MODE_BIN
from magic_pdf.io.DiskReaderWriter import DiskReaderWriter from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
from magic_pdf.spark.spark_api import parse_union_pdf from magic_pdf.spark.spark_api import parse_union_pdf, parse_txt_pdf, parse_ocr_pdf
import os import os
import json as json_parse import json as json_parse
from datetime import datetime from datetime import datetime
# Accepted values for the ``--method`` CLI option: "ocr", "txt" or "auto".
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def get_pdf_parse_method(method):
    """Return the parse function that corresponds to *method*.

    ``"ocr"`` selects ``parse_ocr_pdf`` and ``"txt"`` selects
    ``parse_txt_pdf``. Any other value (including ``"auto"``) falls back
    to ``parse_union_pdf``.
    """
    # Checked in order; the first matching name wins.
    explicit_parsers = (
        ("ocr", parse_ocr_pdf),
        ("txt", parse_txt_pdf),
    )
    for name, parser in explicit_parsers:
        if method == name:
            return parser
    # No explicit match: use the union parser as the default.
    return parse_union_pdf
def prepare_env(): def prepare_env():
local_parent_dir = os.path.join( local_parent_dir = os.path.join(
get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d-%H-%M-%S") get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
...@@ -56,7 +67,13 @@ def cli(): ...@@ -56,7 +67,13 @@ def cli():
@cli.command() @cli.command()
@click.option("--json", type=str, help="输入一个S3路径") @click.option("--json", type=str, help="输入一个S3路径")
def json_command(json): @click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def json_command(json, method):
if not json.startswith("s3://"): if not json.startswith("s3://"):
print("usage: python magipdf.py --json s3://some_bucket/some_path") print("usage: python magipdf.py --json s3://some_bucket/some_path")
os.exit(1) os.exit(1)
...@@ -82,7 +99,8 @@ def json_command(json): ...@@ -82,7 +99,8 @@ def json_command(json):
local_image_dir, _ = prepare_env() local_image_dir, _ = prepare_env()
local_image_rw = DiskReaderWriter(local_image_dir) local_image_rw = DiskReaderWriter(local_image_dir)
parse_union_pdf(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True) parse = get_pdf_parse_method(method)
parse(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
@cli.command() @cli.command()
...@@ -90,7 +108,13 @@ def json_command(json): ...@@ -90,7 +108,13 @@ def json_command(json):
"--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径" "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
) )
@click.option("--model", type=click.Path(exists=True), help="模型的路径") @click.option("--model", type=click.Path(exists=True), help="模型的路径")
def pdf_command(pdf, model): @click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def pdf_command(pdf, model, method):
# 这里处理pdf和模型相关的逻辑 # 这里处理pdf和模型相关的逻辑
if model is None: if model is None:
model = pdf.replace(".pdf", ".json") model = pdf.replace(".pdf", ".json")
...@@ -107,7 +131,8 @@ def pdf_command(pdf, model): ...@@ -107,7 +131,8 @@ def pdf_command(pdf, model):
local_image_dir, _ = prepare_env() local_image_dir, _ = prepare_env()
local_image_rw = DiskReaderWriter(local_image_dir) local_image_rw = DiskReaderWriter(local_image_dir)
parse_union_pdf(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True) parse = get_pdf_parse_method(method)
parse(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment