Commit f52c6249 authored by 赵小蒙

更新路径输入和markdown输出逻辑

parent ca7059e5
import json
import os
import sys
from pathlib import Path
......@@ -6,7 +7,7 @@ import click
from loguru import logger
from magic_pdf.libs.commons import join_path, read_file
from magic_pdf.dict2md.mkcontent import mk_mm_markdown
from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
from magic_pdf.pipeline import parse_pdf_by_model
......@@ -32,7 +33,8 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
os.makedirs(parent_dir)
if not paras_dict.get('need_drop'):
markdown_content = mk_mm_markdown(paras_dict)
content_list = mk_universal_format(paras_dict)
markdown_content = mk_mm_markdown(content_list)
else:
markdown_content = paras_dict['drop_reason']
......@@ -70,8 +72,8 @@ def main_shell(pdf_file_path: str, save_path: str):
@click.command()
@click.option("--pdf-dir", help="s3上pdf文件的路径")
@click.option("--model-dir", help="s3上pdf文件的路径")
@click.option("--pdf-dir", help="本地pdf文件的路径")
@click.option("--model-dir", help="本地模型文件的路径")
@click.option("--start-page-num", default=0, help="从第几页开始解析")
def main_shell2(pdf_dir: str, model_dir: str,start_page_num: int):
# 先扫描所有的pdf目录里的文件名字
......@@ -86,8 +88,10 @@ def main_shell2(pdf_dir: str, model_dir: str,start_page_num: int):
for pdf_file in pdf_file_names:
pdf_file_path = os.path.join(pdf_dir, pdf_file)
model_file_path = os.path.join(model_dir, pdf_file)
main(pdf_file_path, None, model_file_path, None, start_page_num)
# Build the path of the model-output JSON that shares the PDF's base name.
# NOTE: str.rstrip(".pdf") removes any trailing run of the characters
# ".", "p", "d", "f" (e.g. "app.pdf" -> "a"), so it cannot be used to drop
# a file extension; strip the exact ".pdf" suffix instead.
pdf_suffix = ".pdf"
model_file_stem = pdf_file[:-len(pdf_suffix)] if pdf_file.endswith(pdf_suffix) else pdf_file
model_file_path = os.path.join(model_dir, model_file_stem + ".json")
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(model_file_path, "r", encoding="utf-8") as json_file:
    model_list = json.load(json_file)
# Hand the already-parsed model output (not a path) to the pipeline entry point.
main(pdf_file_path, None, model_list, None, start_page_num)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment