Commit 2e79da59 authored by quyuan

add ci

parent 65e83285
......@@ -37,6 +37,4 @@ jobs:
echo "start test"
cd $GITHUB_WORKSPACE/tests/benchmark/
tree
sh env.sh
python benchmark.py
......@@ -18,7 +18,7 @@ def test_cli():
rm_cmd = f"rm -rf {pdf_res_path}"
os.system(rm_cmd)
os.makedirs(pdf_res_path)
cmd = f'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")} --inside_model true'
cmd = f'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")}'
os.system(cmd)
for root, dirs, files in os.walk(pdf_res_path):
for magic_file in files:
......
# Provision the MinerU benchmark environment: conda env, magic-pdf, model weights.
# NOTE(review): the activation below only lasts for this script's own process;
# a caller that runs `sh env.sh` does not inherit the activated environment.

# `-y` keeps environment creation non-interactive so CI does not hang on the prompt.
conda create -y -n MinerU python=3.10
# `conda activate` is a shell function that a plain `sh env.sh` has not loaded;
# source conda's POSIX shell hook first so the activation actually succeeds.
. "$(conda info --base)/etc/profile.d/conda.sh"
conda activate MinerU
pip install magic-pdf
# Quote the extras specifier so the shell never treats [...] as a glob pattern.
pip install "magic-pdf[full-cpu]"
pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
git lfs install
# `git lfs clone` is deprecated; a plain `git clone` downloads the LFS objects
# once `git lfs install` has registered the filters.
git clone https://huggingface.co/wanderkid/PDF-Extract-Kit
#cp magic-pdf.template.json ~/magic-pdf.json
\ No newline at end of file
......@@ -4,6 +4,13 @@ from conf import conf
import subprocess
from lib import common
import logging
import os
import json
from loguru import logger
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
pdf_res_path = conf.conf["pdf_res_path"]
code_path = conf.conf["code_path"]
pdf_dev_path = conf.conf["pdf_dev_path"]
......@@ -18,6 +25,29 @@ class TestCli:
common.check_shell(cmd)
#common.count_folders_and_check_contents(pdf_res_path)
def test_pdf_sdk(self):
    """Parse every PDF in ``pdf_dev_path`` through the SDK pipeline.

    For each ``<name>.pdf`` found directly in ``pdf_dev_path``, the sibling
    ``<name>.json`` is loaded and handed to ``UNIPipe`` under the
    ``"model_list"`` key. The pipe is then classified and parsed, and the
    resulting markdown is written to ``<name>.md`` in the current working
    directory.
    """
    # The writer and image directory depend only on pdf_dev_path, so build
    # them once instead of on every iteration.
    image_writer = DiskReaderWriter(pdf_dev_path)
    image_dir = str(os.path.basename(pdf_dev_path))

    # splitext strips only the final extension, so a name such as
    # "report.v2.pdf" keeps its full stem ("report.v2") instead of being
    # truncated at the first dot.
    demo_names = [
        os.path.splitext(pdf_file)[0]
        for pdf_file in os.listdir(pdf_dev_path)
        if pdf_file.endswith('.pdf')
    ]

    for demo_name in demo_names:
        model_path = os.path.join(pdf_dev_path, f"{demo_name}.json")
        pdf_path = os.path.join(pdf_dev_path, f"{demo_name}.pdf")

        # Context managers close both handles even if reading/parsing fails.
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
        with open(model_path, "r", encoding="utf-8") as model_file:
            model_json = json.load(model_file)

        jso_useful_key = {"_pdf_type": "", "model_list": model_json}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        pipe.pipe_classify()
        pipe.pipe_parse()
        md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")

        with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
            f.write(md_content)
# def test_pdf_specify_jsonl(self):
# """
# 输入jsonl, 默认方式解析
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment