Commit 80e7a50e authored by quyuan

add ci

parent 2e79da59
......@@ -35,6 +35,5 @@ jobs:
      - name: get-benchmark-result
        run: |
          echo "start test"
          cd $GITHUB_WORKSPACE/tests/benchmark/
          tree
          cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_ben.py
"""
bench
"""
import os
import shutil
import json
import calculate_score
code_path = os.environ.get('GITHUB_WORKSPACE')
#评测集存放路径
pdf_dev_path = "datasets/"
#magicpdf跑测结果
pdf_res_path = "/tmp/magic-pdf"
def test_cli():
"""
test pdf-command cli
"""
rm_cmd = f"rm -rf {pdf_res_path}"
os.system(rm_cmd)
os.makedirs(pdf_res_path)
cmd = f'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")}'
os.system(cmd)
for root, dirs, files in os.walk(pdf_res_path):
for magic_file in files:
target_dir = os.path.join(pdf_dev_path, "mineru")
if magic_file.endswith(".md"):
source_file = os.path.join(root, magic_file)
target_file = os.path.join(pdf_dev_path, "mineru", magic_file)
if not os.path.exists(target_dir):
os.makedirs(target_dir)
shutil.copy(source_file, target_file)


def get_score():
    """
    get score
    """
    data_path = os.path.join(pdf_dev_path, "ci")
    score = calculate_score.Scoring(os.path.join(data_path, "result.json"))
    score.calculate_similarity_total("mineru", data_path)
    res = score.summary_scores()
    return res


def ci_ben():
    """
    ci benchmark
    """
    # fall back to the zero baseline seeded in result.json if no history exists
    last_simscore = last_editdistance = last_bleu = 0
    try:
        with open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        last_line = lines[-1].strip()
        last_score = json.loads(last_line)
        print("last_score:", last_score)
        last_simscore = last_score["average_sim_score"]
        last_editdistance = last_score["average_edit_distance"]
        last_bleu = last_score["average_bleu_score"]
    except IOError:
        print("result.json does not exist, using zero baseline")
    test_cli()
    os.system(f"python pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
    now_score = get_score()
    print("now_score:", now_score)
    now_simscore = now_score["average_sim_score"]
    now_editdistance = now_score["average_edit_distance"]
    now_bleu = now_score["average_bleu_score"]
    assert last_simscore <= now_simscore
    assert last_editdistance <= now_editdistance
    assert last_bleu <= now_bleu


if __name__ == "__main__":
    os.system("sh env.sh")
    ci_ben()
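For reference, the regression gate above reduces to a three-way comparison between the last recorded summary line and the fresh one. A minimal sketch of that check in isolation (the function name `gate` is hypothetical; the keys are the ones written by `summary_scores()`):

```python
import json

def gate(last: dict, now: dict) -> None:
    """Fail the run if any tracked metric drops below the stored baseline."""
    for key in ("average_sim_score", "average_edit_distance", "average_bleu_score"):
        assert last[key] <= now[key], f"{key} regressed: {now[key]} < {last[key]}"

# e.g. gate(json.loads(last_line), get_score())
```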
"""
calculate_score
"""
import os
import re
import json
import scoring
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from Levenshtein import distance
class Scoring:
"""
calculate_score
"""
def __init__(self, result_path):
"""
init
"""
self.edit_distances = []
self.bleu_scores = []
self.sim_scores = []
self.filenames = []
self.score_dict = {}
self.anntion_cnt = 0
self.fw = open(result_path, "w+", encoding='utf-8')

    def simple_bleu_score(self, candidate, reference):
        """
        get bleu score
        """
        candidate_tokens = word_tokenize(candidate)
        reference_tokens = word_tokenize(reference)
        return sentence_bleu([reference_tokens], candidate_tokens,
                             smoothing_function=SmoothingFunction().method1)

    def preprocess_string(self, s):
        """
        collapse repeated newlines and spaces before comparison
        """
        sub_enter = re.sub(r'\n+', '\n', s)
        return re.sub(r' +', ' ', sub_enter)

    def calculate_similarity(self, annotation, actual, tool_type):
        """
        calculate_similarity
        """
        class_dict = {}
        edit_distances = []
        bleu_scores = []
        sim_scores = []
        total_file = 0
        for filename in os.listdir(annotation):
            if filename.endswith('.md') and not filename.startswith('.'):
                total_file = total_file + 1
                with open(os.path.join(annotation, filename), 'r', encoding='utf-8') as file_a:
                    content_a = file_a.read()
                self.annotation_cnt = self.annotation_cnt + 1
                filepath_b = os.path.join(actual, filename)
                if os.path.exists(filepath_b):
                    with open(filepath_b, 'r', encoding='utf-8') as file_b:
                        content_b = file_b.read()
                        self.filenames.append(filename)
                        # normalized Levenshtein distance in [0, 1]
                        edit_dist = distance(self.preprocess_string(content_b),
                                             self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
                        self.edit_distances.append(edit_dist)
                        edit_distances.append(edit_dist)
                        bleu_score = self.simple_bleu_score(content_b, content_a)
                        bleu_scores.append(bleu_score)
                        self.bleu_scores.append(bleu_score)
                        score = scoring.score_text(content_b, content_a)
                        sim_scores.append(score)
                        self.sim_scores.append(score)
                        class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                        self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                else:
                    print(f"File {filename} not found in actual directory.")
        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
        ratio = len(class_dict) / total_file if total_file else 0
        self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
        print(f"{tool_type} extract ratio: {ratio}")
        print(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
        print(f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
        print(f"{tool_type} Average Sim Score: {class_average_sim_score}")
        return self.score_dict

    def summary_scores(self):
        """
        average the edit distance, bleu score and sim score over all files
        """
        over_all_dict = {}
        average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
        average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
        average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
        over_all_dict["average_edit_distance"] = average_edit_distance
        over_all_dict["average_bleu_score"] = average_bleu_score
        over_all_dict["average_sim_score"] = average_sim_score
        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
        self.fw.close()
        return over_all_dict

    def calculate_similarity_total(self, tool_type, download_dir):
        """
        run the per-file comparison over the cleaned annotation and tool output dirs
        """
        annotation = os.path.join(download_dir, "annotations", "cleaned")
        actual = os.path.join(download_dir, tool_type, "cleaned")
        score = self.calculate_similarity(annotation, actual, tool_type)
        return score
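A usage sketch for the class above, assuming the directory layout produced by pre_clean.py (an annotations/cleaned ground-truth dir and a <tool>/cleaned output dir under the data path; the paths here are illustrative, mirroring get_score() in test_ben.py):

```python
from calculate_score import Scoring

score = Scoring("datasets/ci/result.json")            # hypothetical data path
score.calculate_similarity_total("mineru", "datasets/ci")
print(score.summary_scores())
# -> {"average_edit_distance": ..., "average_bleu_score": ..., "average_sim_score": ...}
```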
# New Stars of Mathematics: Call for Solutions
Issue 15 (2016.06)
Hosted by Xiaosheng Mu

Problem 1. Let $z_{1}, z_{2}, z_{3}$ be unit complex numbers. Prove that there exists a unit complex number $z$ such that:
$$
\frac{1}{\left|z-z_{1}\right|^{2}}+\frac{1}{\left|z-z_{2}\right|^{2}}+\frac{1}{\left|z-z_{3}\right|^{2}} \leq \frac{9}{4}
$$
(Proposed by Wang Yixuan, Wugang No. 3 High School, Hubei, and Leng Gangsong, Shanghai University)
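A quick sharpness check, not part of the original statement: with $z_{1}, z_{2}, z_{3}$ the cube roots of unity ($1, \omega, \omega^{2}$) and $z = -1$, the identity $1+\omega+\omega^{2}=0$ gives $-1-\omega=\omega^{2}$ and $-1-\omega^{2}=\omega$, so
$$
\frac{1}{|-1-1|^{2}}+\frac{1}{|-1-\omega|^{2}}+\frac{1}{|-1-\omega^{2}|^{2}}=\frac{1}{4}+1+1=\frac{9}{4},
$$
showing the constant $9/4$ cannot be improved.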
Problem 2. As shown in the figure, $D$ is a point on side $BC$ of equilateral triangle $ABC$ with $BD > CD$. Let $O_{1}, I_{1}$ be the circumcenter and incenter of $\triangle ABD$, and $O_{2}, I_{2}$ the circumcenter and incenter of $\triangle ACD$. The external common tangent of circles $I_{1}$ and $I_{2}$ other than $BC$ meets $AB, AC$ at $P, Q$. Suppose lines $PI_{1}$ and $QI_{2}$ meet at $R$, and lines $O_{1}I_{1}$ and $O_{2}I_{2}$ meet at $T$. Prove that $AT^{2} = AR^{2} + AD \cdot BC$.
(Proposed by Lu Sheng, Qinzhou, Guangxi)

Problem 3. Given positive integers $m, n$, consider an $m \times n$ board of white cells on which some cells are first colored black. Afterwards, at any moment, if a white cell is adjacent to at least two black cells, it may also be colored black. At least how many cells must initially be colored black so that the whole board can be blackened at some moment?
(Proposed by Xiaosheng Mu, Harvard University)

Problem 4. $ABC$ is a triangle, and $P, Q, R$ are points on $BC, CA, AB$ respectively. Prove that the perimeter of $\triangle PQR$ is not less than the minimum of the perimeters of $\triangle AQR$, $\triangle BRP$, $\triangle CPQ$.
(Proposed by Xiaosheng Mu, Harvard University)
conda create -n MinerU python=3.10
conda activate MinerU
pip install magic-pdf
#cp magic-pdf.template.json ~/magic-pdf.json
\ No newline at end of file
{
    "bucket_info": {
        "bucket-name-1": ["ak", "sk", "endpoint"],
        "bucket-name-2": ["ak", "sk", "endpoint"]
    },
    "temp-output-dir": "/tmp",
    "models-dir": "/tmp/models",
    "device-mode": "cpu"
}
\ No newline at end of file
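A quick sanity check one can run locally before invoking magic-pdf. Whether the tool itself requires every field is an assumption here; this only verifies the keys present in the template above, at the `~/magic-pdf.json` location referenced in env.sh:

```python
import json
import os

with open(os.path.expanduser("~/magic-pdf.json"), encoding="utf-8") as f:
    cfg = json.load(f)
for key in ("bucket_info", "temp-output-dir", "models-dir", "device-mode"):
    assert key in cfg, f"missing config key: {key}"
```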
"""
clean data
"""
import argparse
import os
import re
import htmltabletomd # type: ignore
import pypandoc
import argparse
parser = argparse.ArgumentParser(description="get tool type")
parser.add_argument(
"--tool_name",
type=str,
required=True,
help="input tool name",
)
parser.add_argument(
"--download_dir",
type=str,
required=True,
help="input download dir",
)
args = parser.parse_args()


def clean_markdown_images(content):
    """
    clean markdown images
    """
    pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
    cleaned_content = pattern.sub('', content)
    return cleaned_content


def clean_ocrmath_photo(content):
    """
    clean ocrmath photo
    """
    pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
    cleaned_content = pattern.sub('', content)
    return cleaned_content


def convert_html_table_to_md(html_table):
    """
    convert a bare html table fragment (first line containing <tr>) to a
    markdown table; anything else is returned unchanged and left to
    convert_html_tables_to_md below
    """
    lines = html_table.strip().split('\n')
    # pass through content that is not a bare table
    if not lines or '<tr>' not in lines[0]:
        return html_table
    md_table = ''
    in_thead = True
    for line in lines:
        if '<th>' in line:
            cells = re.findall(r'<th>(.*?)</th>', line)
            md_table += '| ' + ' | '.join(cells) + ' |\n'
            in_thead = False
        elif '<td>' in line and not in_thead:
            cells = re.findall(r'<td>(.*?)</td>', line)
            md_table += '| ' + ' | '.join(cells) + ' |\n'
    return md_table.rstrip() + '\n'


def convert_latex_to_md(content):
    """
    convert latex tables to markdown tables
    """
    tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
    placeholders = []
    for table in tables:
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
        replace_str = f"\\begin{{tabular}}{table}\\end{{tabular}}"
        content = content.replace(replace_str, placeholder)
        try:
            pypandoc.convert_text(replace_str, format="latex", to="md",
                                  outputfile="output.md", encoding="utf-8")
        except Exception:
            # fall back to the raw latex if pandoc cannot convert it
            markdown_string = replace_str
        else:
            with open('output.md', 'r', encoding='utf-8') as fr:
                markdown_string = fr.read()
        placeholders.append((placeholder, markdown_string))
    new_content = content
    for placeholder, md_table in placeholders:
        new_content = new_content.replace(placeholder, md_table)
    return new_content


def convert_html_tables_to_md(content):
    """
    convert all html tables embedded in content to markdown tables
    """
    tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
    placeholders = []
    for table in tables:
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
        content = content.replace(f"<table>{table}</table>", placeholder)
        try:
            convert_table = htmltabletomd.convert_table(table)
        except Exception:
            # keep the raw html if conversion fails
            convert_table = table
        placeholders.append((placeholder, convert_table))
    new_content = content
    for placeholder, md_table in placeholders:
        new_content = new_content.replace(placeholder, md_table)
    return new_content


def clean_data(prod_type, download_dir):
    """
    clean every markdown file under download_dir/<tool> into download_dir/<tool>/cleaned
    """
    tgt_dir = os.path.join(download_dir, prod_type, "cleaned")
    if not os.path.exists(tgt_dir):
        os.makedirs(tgt_dir)
    source_dir = os.path.join(download_dir, prod_type)
    filenames = os.listdir(source_dir)
    for filename in filenames:
        if filename.endswith('.md'):
            input_file = os.path.join(source_dir, filename)
            output_file = os.path.join(tgt_dir, "cleaned_" + filename)
            with open(input_file, 'r', encoding='utf-8') as fr:
                content = fr.read()
            new_content = clean_markdown_images(content)
            new_content = convert_html_table_to_md(new_content)
            new_content = convert_latex_to_md(new_content)
            new_content = convert_html_tables_to_md(new_content)
            with open(output_file, 'w', encoding='utf-8') as fw:
                fw.write(new_content)


if __name__ == '__main__':
    tool_type = args.tool_name
    download_dir = args.download_dir
    clean_data(tool_type, download_dir)
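An inline illustration of the regex-based cleaning steps (the table converters shell out to pypandoc and htmltabletomd, so this sample sticks to the pure-regex functions; the sample string is hypothetical):

```python
sample = "intro ![fig](img/1.png) body \\includegraphics[width=5cm]{plot.pdf} end"
print(clean_ocrmath_photo(clean_markdown_images(sample)))
# -> "intro  body  end"  (image link and \includegraphics command removed)
```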
{"average_sim_score":0, "average_edit_distance":0, "average_bleu_score": 0}
\ No newline at end of file
from statistics import mean

from rapidfuzz import fuzz

CHUNK_MIN_CHARS = 25


def chunk_text(text, chunk_len=500):
    # split into fixed-size chunks, dropping blank or very short pieces
    chunks = [text[i:i + chunk_len] for i in range(0, len(text), chunk_len)]
    chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
    return chunks


def overlap_score(hypothesis_chunks, reference_chunks):
    # best fuzzy-match score for each hypothesis chunk, searched within a
    # window of reference chunks around its length-scaled position
    if len(reference_chunks) > 0:
        length_modifier = len(hypothesis_chunks) / len(reference_chunks)
    else:
        length_modifier = 0
    search_distance = max(len(reference_chunks) // 5, 10)
    chunk_scores = []
    for i, hyp_chunk in enumerate(hypothesis_chunks):
        max_score = 0
        i_offset = int(i * length_modifier)
        chunk_range = range(max(0, i_offset - search_distance),
                            min(len(reference_chunks), i_offset + search_distance))
        for j in chunk_range:
            ref_chunk = reference_chunks[j]
            score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
            if score > max_score:
                max_score = score
        chunk_scores.append(max_score)
    return chunk_scores


def score_text(hypothesis, reference):
    # Returns a 0-1 alignment score
    hypothesis_chunks = chunk_text(hypothesis)
    reference_chunks = chunk_text(reference)
    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
    if len(chunk_scores) > 0:
        return mean(chunk_scores)
    return 0
\ No newline at end of file
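A usage sketch for the chunk-alignment score; note that chunk_text() drops pieces shorter than CHUNK_MIN_CHARS, so inputs under about 25 characters score 0 by construction:

```python
ref = "The quick brown fox jumps over the lazy dog. " * 20
hyp = ref.replace("lazy", "sleepy")
print(score_text(hyp, ref))  # near 1.0 for almost-identical texts
```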
......@@ -2,6 +2,6 @@ import os
conf = {
"code_path": os.environ.get('GITHUB_WORKSPACE'),
"pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
"pdf_res_path": "/tmp"
"pdf_res_path": "/tmp/magic-pdf"
}
......@@ -6,35 +6,27 @@ from lib import common
import logging
import os
import json
from loguru import logger
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
pdf_res_path = conf.conf["pdf_res_path"]
code_path = conf.conf["code_path"]
pdf_dev_path = conf.conf["pdf_dev_path"]
class TestCli:
    def test_pdf_specify_dir(self):
        """
        test cli: input PDFs and model results from a specified directory
        """
        cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, pdf_dev_path)
        logging.info(cmd)
        common.check_shell(cmd)
        # common.count_folders_and_check_contents(pdf_res_path)

    def test_pdf_sdk(self):
        """
        parse via the pdf sdk
        """
        demo_names = list()
        pdf_path = os.path.join(pdf_dev_path, "pdf")
        for pdf_file in os.listdir(pdf_path):
            if pdf_file.endswith('.pdf'):
                demo_names.append(pdf_file.split('.')[0])
        for demo_name in demo_names:
            model_path = os.path.join(pdf_dev_path, f"{demo_name}_model.json")
            pdf_path = os.path.join(pdf_dev_path, f"{demo_name}.pdf")
            pdf_bytes = open(pdf_path, "rb").read()
            model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
......@@ -45,9 +37,11 @@ class TestCli:
        pipe.pipe_classify()
        pipe.pipe_parse()
        md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
        res_path = os.path.join(pdf_dev_path, "miner", f"{demo_name}.md")
        with open(res_path, "w", encoding="utf-8") as f:
            f.write(md_content)
    # def test_pdf_specify_jsonl(self):
    #     """
    #     input jsonl, parse with the default method
    #     """
......