Unverified commit 0aa45778, authored by yyy, committed by GitHub

feat: add test case (#645)

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

* feat: add table case

---------
Co-authored-by: quyuan <quyuan@pjlab.org>
parent 24c143fe
...@@ -10,7 +10,6 @@ on: ...@@ -10,7 +10,6 @@ on:
paths-ignore: paths-ignore:
- "cmds/**" - "cmds/**"
- "**.md" - "**.md"
- "**.yml"
pull_request: pull_request:
branches: branches:
- "master" - "master"
...@@ -18,12 +17,11 @@ on: ...@@ -18,12 +17,11 @@ on:
paths-ignore: paths-ignore:
- "cmds/**" - "cmds/**"
- "**.md" - "**.md"
- "**.yml"
workflow_dispatch: workflow_dispatch:
jobs: jobs:
cli-test: cli-test:
runs-on: pdf runs-on: pdf
timeout-minutes: 120 timeout-minutes: 240
strategy: strategy:
fail-fast: true fail-fast: true
...@@ -33,17 +31,16 @@ jobs: ...@@ -33,17 +31,16 @@ jobs:
with: with:
fetch-depth: 2 fetch-depth: 2
- name: install - name: install&test
run: | run: |
echo $GITHUB_WORKSPACE && sh tests/retry_env.sh source activate mineru
- name: unit test conda env list
run: | pip show coverage
cd $GITHUB_WORKSPACE && python tests/clean_coverage.py # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
cd $GITHUB_WORKSPACE && export PYTHONPATH=. && coverage run -m pytest tests/unittest --cov=magic_pdf/ --cov-report term-missing --cov-report html cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
cd $GITHUB_WORKSPACE && python tests/get_coverage.py cd $GITHUB_WORKSPACE && python tests/get_coverage.py
- name: cli test cd $GITHUB_WORKSPACE && pytest -m P0 -s -v tests/test_cli/test_cli_sdk.py
run: |
source ~/.bashrc && cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli.py
notify_to_feishu: notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }} if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
......
# Scheduled run: executes the unit tests with coverage and the CLI/SDK test suite
# on the self-hosted "pdf" runner, then posts a webhook notification on failure.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: mineru

on:
  schedule:
    - cron: '0 22 * * *'  # run every day at 22:00 (GitHub evaluates schedules in UTC)

jobs:
  cli-test:
    runs-on: pdf
    timeout-minutes: 240
    strategy:
      fail-fast: true
    steps:
      - name: PDF cli
        uses: actions/checkout@v3
        with:
          fetch-depth: 2

      - name: install&test
        run: |
          source activate mineru
          conda env list
          pip show coverage
          # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
          cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
          cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
          cd $GITHUB_WORKSPACE && python tests/get_coverage.py
          cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli_sdk.py

  notify_to_feishu:
    # Runs only when a needed job failed on master and the run was not cancelled.
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
    needs: cli-test
    runs-on: pdf
    steps:
      - name: get_actor
        run: |
          metion_list="dt-yy"
          echo $GITHUB_ACTOR
          if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
            metion_list="xuchao"
          elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
            metion_list="zhaoxiaomeng"
          elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
            metion_list="xurui1"
          fi
          echo $metion_list
          # Exported for later steps of this job. Values appended to GITHUB_ENV are
          # not visible inside the step that writes them, so the former
          # `echo ${{ env.METIONS }}` here could only ever print an empty line.
          echo "METIONS=$metion_list" >> "$GITHUB_ENV"

      - name: notify
        # Secrets are handed over as environment variables instead of being
        # spliced into the script text, and are no longer echoed to the log.
        env:
          USER_ID: ${{ secrets.USER_ID }}
          WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }}
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'"${GITHUB_REPOSITORY}"' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'"${GITHUB_REPOSITORY}"'/actions/runs/'"${GITHUB_RUN_ID}"'"},{"tag":"at","user_id":"'"${USER_ID}"'"}]]}}}}' "${WEBHOOK_URL}"
# Push-triggered run: executes the unit tests with coverage and the CLI/SDK test
# suite on the self-hosted "pdf" runner, then posts a webhook notification on failure.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: mineru

on:
  push:
    branches:
      - "master"
      - "dev"
    paths-ignore:
      - "cmds/**"
      - "**.md"
  workflow_dispatch:

jobs:
  cli-test:
    runs-on: pdf
    timeout-minutes: 240
    strategy:
      fail-fast: true
    steps:
      - name: PDF cli
        uses: actions/checkout@v3
        with:
          fetch-depth: 2

      - name: install&test
        run: |
          source activate mineru
          conda env list
          pip show coverage
          # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
          cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
          cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
          cd $GITHUB_WORKSPACE && python tests/get_coverage.py
          cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli_sdk.py

  notify_to_feishu:
    # Runs only when a needed job failed on master and the run was not cancelled.
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
    needs: cli-test
    runs-on: pdf
    steps:
      - name: get_actor
        run: |
          metion_list="dt-yy"
          echo $GITHUB_ACTOR
          if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
            metion_list="xuchao"
          elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
            metion_list="zhaoxiaomeng"
          elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
            metion_list="xurui1"
          fi
          echo $metion_list
          # Exported for later steps of this job. Values appended to GITHUB_ENV are
          # not visible inside the step that writes them, so the former
          # `echo ${{ env.METIONS }}` here could only ever print an empty line.
          echo "METIONS=$metion_list" >> "$GITHUB_ENV"

      - name: notify
        # Secrets are handed over as environment variables instead of being
        # spliced into the script text, and are no longer echoed to the log.
        env:
          USER_ID: ${{ secrets.USER_ID }}
          WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }}
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'"${GITHUB_REPOSITORY}"' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'"${GITHUB_REPOSITORY}"'/actions/runs/'"${GITHUB_RUN_ID}"'"},{"tag":"at","user_id":"'"${USER_ID}"'"}]]}}}}' "${WEBHOOK_URL}"
# Triggered by pushing a tag matching "*released" or by a manual dispatch.
# At present the job only checks out the repository and echoes a message.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: update-base

on:
  push:
    tags:
      - '*released'  # quoted: a bare leading * would be parsed as a YAML alias
  workflow_dispatch:

jobs:
  pdf-test:
    runs-on: pdf
    timeout-minutes: 40
    steps:
      - name: update-base
        uses: actions/checkout@v3

      - name: start-update
        run: |
          echo "start test"
*.tar *.tar
*.tar.gz *.tar.gz
*.zip
venv*/ venv*/
envs/ envs/
slurm_logs/ slurm_logs/
...@@ -31,7 +32,7 @@ tmp ...@@ -31,7 +32,7 @@ tmp
.vscode .vscode
.vscode/ .vscode/
ocr_demo ocr_demo
.coveragerc
/app/common/__init__.py /app/common/__init__.py
/magic_pdf/config/__init__.py /magic_pdf/config/__init__.py
source.dev.env source.dev.env
......
...@@ -16,4 +16,5 @@ pypandoc ...@@ -16,4 +16,5 @@ pypandoc
pyopenssl==24.0.0 pyopenssl==24.0.0
struct-eqtable==0.1.0 struct-eqtable==0.1.0
pytest-cov pytest-cov
beautifulsoup4 beautifulsoup4
\ No newline at end of file coverage
\ No newline at end of file
...@@ -21,4 +21,5 @@ def delete_file(path): ...@@ -21,4 +21,5 @@ def delete_file(path):
print(f"Error deleting directory '{path}': {e}") print(f"Error deleting directory '{path}': {e}")
if __name__ == "__main__": if __name__ == "__main__":
delete_file("htmlcov") delete_file("htmlcov/")
\ No newline at end of file #delete_file(".coverage")
#!/bin/bash #!/bin/bash
# 定义最大重试次数
max_retries=5 max_retries=5
retry_count=0 retry_count=0
while true; do while true; do
# prepare env # prepare env
source activate MinerU #python -m pip install -r requirements-qa.txt
pip install -r requirements-qa.txt python -m pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
pip uninstall magic-pdf python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
exit_code=$? exit_code=$?
if [ $exit_code -eq 0 ]; then if [ $exit_code -eq 0 ]; then
echo "test.sh 成功执行!" echo "test.sh 成功执行!"
...@@ -22,6 +19,6 @@ while true; do ...@@ -22,6 +19,6 @@ while true; do
exit 1 exit 1
fi fi
echo "test.sh 执行失败 (退出码: $exit_code)。尝试第 $retry_count 次重试..." echo "test.sh 执行失败 (退出码: $exit_code)。尝试第 $retry_count 次重试..."
sleep 5 # 等待 5 秒后重试 sleep 5
fi fi
done done
...@@ -4,5 +4,5 @@ conf = { ...@@ -4,5 +4,5 @@ conf = {
"pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev", "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
"pdf_res_path": "/tmp/magic-pdf", "pdf_res_path": "/tmp/magic-pdf",
"jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl", "jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl",
"s3_pdf_path": "s3://llm-qatest-pnorm/mineru/test/test.pdf" "s3_pdf_path": "s3://llm-qatest-pnorm/mineru/test/test_rearch_report.pdf"
} }
\ No newline at end of file
import pytest
import torch
def clear_gpu_memory():
    """Release the unused memory cached by PyTorch's CUDA allocator.

    Returns immediately on machines without a usable CUDA device, so the
    "GPU memory cleared." message is only printed when a cache was actually
    asked to be emptied.
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    print("GPU memory cleared.")
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_teardown(item, nextitem):
    """Wrap the teardown phase of every test and free GPU memory afterwards.

    Registered as a hook wrapper: execution pauses at ``yield`` while the
    other teardown implementations run, then resumes to clear the GPU cache.
    ``item`` and ``nextitem`` are supplied by pytest and are not used here.
    """
    # Hand control to the wrapped teardown hooks first.
    yield
    # Teardown is finished; drop cached GPU memory before the next test.
    clear_gpu_memory()
\ No newline at end of file
"""common definitions.""" """common definitions."""
import os import os
import shutil import shutil
import re
import json
def check_shell(cmd): def check_shell(cmd):
"""shell successful.""" """shell successful."""
res = os.system(cmd) res = os.system(cmd)
assert res == 0 assert res == 0
def update_config_file(file_path, key, value):
    """Set one top-level key in a JSON config file and write the file back.

    Args:
        file_path: path of a UTF-8 JSON file whose top level is an object.
        key: name of the entry to add or overwrite.
        value: JSON-serialisable value stored under ``key``.

    Raises:
        OSError: if the file cannot be opened for reading or writing.
        json.JSONDecodeError: if the existing content is not valid JSON.
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        config = json.load(f)
    config[key] = value
    # The file is rewritten in full with the updated mapping.
    with open(file_path, 'w', encoding="utf-8") as f:
        json.dump(config, f)
def cli_count_folders_and_check_contents(file_path): def cli_count_folders_and_check_contents(file_path):
"""" count cli files.""" """" count cli files."""
...@@ -40,4 +47,33 @@ def delete_file(path): ...@@ -40,4 +47,33 @@ def delete_file(path):
shutil.rmtree(path) shutil.rmtree(path)
print(f"Directory '{path}' and its contents deleted.") print(f"Directory '{path}' and its contents deleted.")
except TypeError as e: except TypeError as e:
print(f"Error deleting directory '{path}': {e}") print(f"Error deleting directory '{path}': {e}")
\ No newline at end of file
def check_latex_table_exists(file_path):
    """Tell whether a text file contains a LaTeX ``tabular`` environment.

    Args:
        file_path: path of a UTF-8 text file (the tests pass generated markdown).

    Returns:
        True if at least one ``\\begin{tabular} ... \\end{tabular}`` span is
        present (it may extend over several lines), otherwise False.
    """
    pattern = r'\\begin\{tabular\}.*?\\end\{tabular\}'
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Only existence matters, so stop at the first hit instead of collecting all.
    return re.search(pattern, content, re.DOTALL) is not None
def check_html_table_exists(file_path):
    """Tell whether a text file contains an HTML ``<table>`` element.

    Args:
        file_path: path of a UTF-8 text file (the tests pass generated markdown).

    Returns:
        True if at least one ``<table ...> ... </table>`` span is present
        (it may extend over several lines), otherwise False.
    """
    pattern = r'<table.*?>.*?</table>'
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Only existence matters, so stop at the first hit instead of collecting all.
    return re.search(pattern, content, re.DOTALL) is not None
def check_close_tables(file_path):
    """Tell whether a text file is free of any table markup.

    Args:
        file_path: path of a UTF-8 text file (the tests pass generated markdown).

    Returns:
        True when the file contains neither a LaTeX ``tabular`` environment nor
        an HTML ``<table>`` element; False as soon as either one is present.
    """
    table_patterns = (
        r'\\begin\{tabular\}.*?\\end\{tabular\}',
        r'<table.*?>.*?</table>',
    )
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return not any(
        re.search(pattern, content, re.DOTALL) for pattern in table_patterns
    )
\ No newline at end of file
"""test cli and sdk.""" """test cli and sdk."""
import logging import logging
import os import os
import pytest import pytest
from conf import conf from conf import conf
from lib import common from lib import common
import time
import magic_pdf.model as model_config import magic_pdf.model as model_config
from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
...@@ -57,6 +56,7 @@ class TestCli: ...@@ -57,6 +56,7 @@ class TestCli:
@pytest.mark.P0 @pytest.mark.P0
def test_pdf_ocr_sdk(self): def test_pdf_ocr_sdk(self):
"""pdf sdk ocr test.""" """pdf sdk ocr test."""
time.sleep(2)
demo_names = list() demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'pdf') pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path): for pdf_file in os.listdir(pdf_path):
...@@ -88,10 +88,11 @@ class TestCli: ...@@ -88,10 +88,11 @@ class TestCli:
with open(res_path, 'w+', encoding='utf-8') as f: with open(res_path, 'w+', encoding='utf-8') as f:
f.write(md_content) f.write(md_content)
common.sdk_count_folders_and_check_contents(res_path) common.sdk_count_folders_and_check_contents(res_path)
@pytest.mark.P0 @pytest.mark.P0
def test_pdf_txt_sdk(self): def test_pdf_txt_sdk(self):
"""pdf sdk txt test.""" """pdf sdk txt test."""
time.sleep(2)
demo_names = list() demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'pdf') pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path): for pdf_file in os.listdir(pdf_path):
...@@ -99,7 +100,6 @@ class TestCli: ...@@ -99,7 +100,6 @@ class TestCli:
demo_names.append(pdf_file.split('.')[0]) demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names: for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf') pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
print(pdf_path)
pdf_bytes = open(pdf_path, 'rb').read() pdf_bytes = open(pdf_path, 'rb').read()
local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images') local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
image_dir = str(os.path.basename(local_image_dir)) image_dir = str(os.path.basename(local_image_dir))
...@@ -123,10 +123,11 @@ class TestCli: ...@@ -123,10 +123,11 @@ class TestCli:
with open(res_path, 'w+', encoding='utf-8') as f: with open(res_path, 'w+', encoding='utf-8') as f:
f.write(md_content) f.write(md_content)
common.sdk_count_folders_and_check_contents(res_path) common.sdk_count_folders_and_check_contents(res_path)
@pytest.mark.P0 @pytest.mark.P0
def test_pdf_cli_auto(self): def test_pdf_cli_auto(self):
"""magic_pdf cli test auto.""" """magic_pdf cli test auto."""
time.sleep(2)
demo_names = [] demo_names = []
pdf_path = os.path.join(pdf_dev_path, 'pdf') pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path): for pdf_file in os.listdir(pdf_path):
...@@ -141,10 +142,11 @@ class TestCli: ...@@ -141,10 +142,11 @@ class TestCli:
os.system(cmd) os.system(cmd)
common.cli_count_folders_and_check_contents( common.cli_count_folders_and_check_contents(
os.path.join(res_path, demo_name, 'auto')) os.path.join(res_path, demo_name, 'auto'))
@pytest.mark.P0 @pytest.mark.P0
def test_pdf_clit_txt(self): def test_pdf_cli_txt(self):
"""magic_pdf cli test txt.""" """magic_pdf cli test txt."""
time.sleep(2)
demo_names = [] demo_names = []
pdf_path = os.path.join(pdf_dev_path, 'pdf') pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path): for pdf_file in os.listdir(pdf_path):
...@@ -159,10 +161,11 @@ class TestCli: ...@@ -159,10 +161,11 @@ class TestCli:
os.system(cmd) os.system(cmd)
common.cli_count_folders_and_check_contents( common.cli_count_folders_and_check_contents(
os.path.join(res_path, demo_name, 'txt')) os.path.join(res_path, demo_name, 'txt'))
@pytest.mark.P0 @pytest.mark.P0
def test_pdf_clit_ocr(self): def test_pdf_cli_ocr(self):
"""magic_pdf cli test ocr.""" """magic_pdf cli test ocr."""
time.sleep(2)
demo_names = [] demo_names = []
pdf_path = os.path.join(pdf_dev_path, 'pdf') pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path): for pdf_file in os.listdir(pdf_path):
...@@ -177,85 +180,102 @@ class TestCli: ...@@ -177,85 +180,102 @@ class TestCli:
os.system(cmd) os.system(cmd)
common.cli_count_folders_and_check_contents( common.cli_count_folders_and_check_contents(
os.path.join(res_path, demo_name, 'ocr')) os.path.join(res_path, demo_name, 'ocr'))
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1 @pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_txt(self): def test_pdf_dev_cli_local_jsonl_txt(self):
"""magic_pdf_dev cli local txt.""" """magic_pdf_dev cli local txt."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt") cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
logging.info(cmd) logging.info(cmd)
os.system(cmd) os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1 @pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_ocr(self): def test_pdf_dev_cli_local_jsonl_ocr(self):
"""magic_pdf_dev cli local ocr.""" """magic_pdf_dev cli local ocr."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
logging.info(cmd) logging.info(cmd)
os.system(cmd) os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1 @pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_auto(self): def test_pdf_dev_cli_local_jsonl_auto(self):
"""magic_pdf_dev cli local auto.""" """magic_pdf_dev cli local auto."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
logging.info(cmd) logging.info(cmd)
os.system(cmd) os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1 @pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_txt(self): def test_pdf_dev_cli_s3_jsonl_txt(self):
"""magic_pdf_dev cli s3 txt.""" """magic_pdf_dev cli s3 txt."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt") cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
logging.info(cmd) logging.info(cmd)
os.system(cmd) os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1 @pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_ocr(self): def test_pdf_dev_cli_s3_jsonl_ocr(self):
"""magic_pdf_dev cli s3 ocr.""" """magic_pdf_dev cli s3 ocr."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
logging.info(cmd) logging.info(cmd)
os.system(cmd) os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1 @pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_auto(self): def test_pdf_dev_cli_s3_jsonl_auto(self):
"""magic_pdf_dev cli s3 auto.""" """magic_pdf_dev cli s3 auto."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl') jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto') cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
logging.info(cmd) logging.info(cmd)
os.system(cmd) os.system(cmd)
@pytest.mark.P1 @pytest.mark.P1
def test_pdf_dev_cli_pdf_json_auto(self): def test_pdf_dev_cli_pdf_json_auto(self):
"""magic_pdf_dev cli pdf+json auto.""" """magic_pdf_dev cli pdf+json auto."""
time.sleep(2)
json_path = os.path.join(pdf_dev_path, 'test_model.json') json_path = os.path.join(pdf_dev_path, 'test_model.json')
pdf_path = os.path.join(pdf_dev_path, 'pdf', 'research_report_1f978cd81fb7260c8f7644039ec2c054.pdf') pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto') cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
logging.info(cmd) logging.info(cmd)
os.system(cmd) os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1 @pytest.mark.P1
def test_pdf_dev_cli_pdf_json_ocr(self): def test_pdf_dev_cli_pdf_json_ocr(self):
"""magic_pdf_dev cli pdf+json ocr.""" """magic_pdf_dev cli pdf+json ocr."""
time.sleep(2)
json_path = os.path.join(pdf_dev_path, 'test_model.json') json_path = os.path.join(pdf_dev_path, 'test_model.json')
pdf_path = os.path.join(pdf_dev_path, 'pdf', 'research_report_1f978cd81fb7260c8f7644039ec2c054.pdf') pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto') cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
logging.info(cmd) logging.info(cmd)
os.system(cmd) os.system(cmd)
@pytest.mark.P1 @pytest.mark.P1
def test_s3_sdk_suto(self): def test_s3_sdk_suto(self):
pdf_ak = os.environ.get('pdf_ak', "") """
test s3 sdk auto.
"""
time.sleep(2)
pdf_ak = os.getenv('pdf_ak')
print (pdf_ak)
pdf_sk = os.environ.get('pdf_sk', "") pdf_sk = os.environ.get('pdf_sk', "")
pdf_bucket = os.environ.get('bucket', "") pdf_bucket = os.environ.get('bucket', "")
pdf_endpoint = os.environ.get('pdf_endpoint', "") pdf_endpoint = os.environ.get('pdf_endpoint', "")
s3_pdf_path = conf.conf["s3_pdf_path"] s3_pdf_path = conf.conf["s3_pdf_path"]
image_dir = "s3://" + pdf_bucket + "/mineru/test/test.md" image_dir = "s3://" + pdf_bucket + "/mineru/test/output"
print (image_dir)
s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint) s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
s3image_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint, parent_path=image_dir) s3image_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint, parent_path=image_dir)
pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN) pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
...@@ -267,6 +287,60 @@ class TestCli: ...@@ -267,6 +287,60 @@ class TestCli:
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
assert len(md_content) > 0 assert len(md_content) > 0
@pytest.mark.P1
def test_local_magic_pdf_open_st_table(self):
    """Run the magic-pdf CLI with magic_pdf_st.json and expect a LaTeX table."""
    time.sleep(2)
    # Install the config under test as ~/magic-pdf.json
    # (presumably the file the CLI reads - confirm).
    pre_cmd = "cp ~/magic_pdf_st.json ~/magic-pdf.json"
    print (pre_cmd)
    # check_shell asserts a zero exit status, so a missing config file fails
    # the test here instead of letting it run against a stale configuration.
    common.check_shell(pre_cmd)
    pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
    # Start from a clean output directory so an old result cannot satisfy the check.
    common.delete_file(pdf_res_path)
    cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
    common.check_shell(cli_cmd)
    md_path = os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")
    res = common.check_latex_table_exists(md_path)
    assert res is True
@pytest.mark.P1
def test_local_magic_pdf_open_html_table(self):
    """Run the magic-pdf CLI with magic_pdf_html.json and expect an HTML table."""
    time.sleep(2)
    # Install the config under test as ~/magic-pdf.json
    # (presumably the file the CLI reads - confirm).
    pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
    # check_shell asserts a zero exit status, so a missing config file fails
    # the test here instead of letting it run against a stale configuration.
    common.check_shell(pre_cmd)
    pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
    # Start from a clean output directory so an old result cannot satisfy the check.
    common.delete_file(pdf_res_path)
    cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
    common.check_shell(cli_cmd)
    md_path = os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")
    res = common.check_html_table_exists(md_path)
    assert res is True
@pytest.mark.P1
def test_magic_pdf_close_html_table_cpu(self):
    """Run the magic-pdf CLI with magic_pdf_html_table_cpu.json and expect an HTML table.

    NOTE(review): the test name says "close", yet the assertion requires an
    HTML table to be present in the output - confirm which one is intended.
    """
    time.sleep(2)
    # Install the config under test as ~/magic-pdf.json
    # (presumably the file the CLI reads - confirm).
    pre_cmd = "cp ~/magic_pdf_html_table_cpu.json ~/magic-pdf.json"
    # check_shell asserts a zero exit status, so a missing config file fails
    # the test here instead of letting it run against a stale configuration.
    common.check_shell(pre_cmd)
    pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
    # Start from a clean output directory so an old result cannot satisfy the check.
    common.delete_file(pdf_res_path)
    cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
    common.check_shell(cli_cmd)
    md_path = os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")
    res = common.check_html_table_exists(md_path)
    assert res is True
@pytest.mark.P1
def test_local_magic_pdf_close_html_table(self):
    """Run the magic-pdf CLI with magic_pdf_close_table.json and expect no table."""
    time.sleep(2)
    # Install the config under test as ~/magic-pdf.json
    # (presumably the file the CLI reads - confirm).
    pre_cmd = "cp ~/magic_pdf_close_table.json ~/magic-pdf.json"
    # check_shell asserts a zero exit status, so a missing config file fails
    # the test here instead of letting it run against a stale configuration.
    common.check_shell(pre_cmd)
    pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
    # Start from a clean output directory so an old result cannot satisfy the check.
    common.delete_file(pdf_res_path)
    cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
    common.check_shell(cli_cmd)
    md_path = os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")
    # True means neither LaTeX nor HTML table markup was found in the output.
    res = common.check_close_tables(md_path)
    assert res is True
if __name__ == '__main__': if __name__ == '__main__':
pytest.main() pytest.main()
"""
test performance
"""
import os
import shutil
import json
from lib import calculate_score
import pytest
from conf import conf
code_path = os.environ.get('GITHUB_WORKSPACE')
pdf_dev_path = conf.conf["pdf_dev_path"]
pdf_res_path = conf.conf["pdf_res_path"]
class TestTable():
    """Container for the performance test cases."""

    def test_perf_close_table(self):
        """Performance case for running with table handling closed.

        Placeholder: the body consists of this docstring only, so the test
        performs no checks and always passes.
        """
def get_score():
    """Build a scorer for ``result.json`` under ``pdf_dev_path`` and return its summary.

    Returns:
        Whatever ``Scoring.summary_scores()`` yields after
        ``calculate_similarity_total("mineru", pdf_dev_path)`` has been called.
    """
    result_file = os.path.join(pdf_dev_path, "result.json")
    scorer = calculate_score.Scoring(result_file)
    scorer.calculate_similarity_total("mineru", pdf_dev_path)
    return scorer.summary_scores()
"""
test table case
"""
import os
import shutil
import json
from lib import calculate_score
import pytest
from conf import conf
code_path = os.environ.get('GITHUB_WORKSPACE')
pdf_dev_path = conf.conf["pdf_dev_path"]
pdf_res_path = conf.conf["pdf_res_path"]
class TestTable():
    """Container for the table test cases.

    Every method below is a placeholder whose body is a docstring only: it
    performs no checks and always passes.
    """

    def test_paddle_table_master_cuda(self):
        """Table model: paddle table master, device mode: cuda."""

    def test_paddle_table_master_cpu(self):
        """Table model: paddle table master, device mode: cpu."""

    def test_st_table_cuda(self):
        """Table model: ST, device mode: cuda."""

    def test_st_table_cpu(self):
        """Table model: ST, device mode: cpu."""

    def test_close_table_cuda(self):
        """Table handling closed, device mode: cuda."""
def get_score():
    """Build a scorer for ``result.json`` under ``pdf_dev_path`` and return its summary.

    Returns:
        Whatever ``Scoring.summary_scores()`` yields after
        ``calculate_similarity_total("mineru", pdf_dev_path)`` has been called.
    """
    result_file = os.path.join(pdf_dev_path, "result.json")
    scorer = calculate_score.Scoring(result_file)
    scorer.calculate_similarity_total("mineru", pdf_dev_path)
    return scorer.summary_scores()
...@@ -7,7 +7,7 @@ class TestppTableModel: ...@@ -7,7 +7,7 @@ class TestppTableModel:
img = Image.open("tests/unittest/test_table/assets/table.jpg") img = Image.open("tests/unittest/test_table/assets/table.jpg")
# 修改table模型路径 # 修改table模型路径
config = {"device": "cuda", config = {"device": "cuda",
"model_dir": "/home/quyuan/PDF-Extract-Kit/models/TabRec/TableMaster"} "model_dir": "/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"}
table_model = ppTableModel(config) table_model = ppTableModel(config)
res = table_model.img2html(img) res = table_model.img2html(img)
true_value = """<td><table border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 </td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n""" true_value = """<td><table border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 
</td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n"""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment