Merge remote-tracking branch 'origin/master'

b708d719 · 赵小蒙 · bf45c8fb · fdb6a2e1 · b708d719 · b708d719
Commit b708d719 authored Jun 26, 2024 by 赵小蒙
20 changed files
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -19,7 +19,7 @@ on:
 jobs:
  pdf-test:
    runs-on: pdf
-    timeout-minutes: 40
+    timeout-minutes: 180
    strategy:
      fail-fast: true
@@ -47,8 +47,7 @@ jobs:
    - name: get-benchmark-result
      run: |
        echo "start test"
-        cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_overall base_data_text.json --badcase_path  text_badcase --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK  --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
+        cd tools && python benchmark.py
-        python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_overall base_data_ocr.json --badcase_path ocr_badcase --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK  --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
  notify_to_feishu:
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}

--- a/tests/test_cli/test_cli.py
+++ b/tests/test_cli/test_cli.py
@@ -16,7 +16,7 @@ class TestCli:
        cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py  pdf-command  --pdf {}' % (code_path, pdf_dev_path)
        logging.info(cmd)
        common.check_shell(cmd)
-        common.count_folders_and_check_contents(pdf_res_path)      
+        #common.count_folders_and_check_contents(pdf_res_path)      
    def test_pdf_specify_jsonl(self):
@@ -26,7 +26,7 @@ class TestCli:
        cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972'" % (code_path)
        logging.info(cmd)
        common.check_shell(cmd)
-        common.count_folders_and_check_contents(pdf_res_path)
+        #common.count_folders_and_check_contents(pdf_res_path)
    def test_pdf_specify_jsonl_txt(self):
        """
@@ -35,7 +35,7 @@ class TestCli:
        cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method txt" % (code_path)
        logging.info(cmd)
        common.check_shell(cmd)
-        common.count_folders_and_check_contents(pdf_res_path)
+        #common.count_folders_and_check_contents(pdf_res_path)
    def test_pdf_specify_jsonl_ocr(self):
        """
@@ -44,7 +44,7 @@ class TestCli:
        cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method ocr" % (code_path)
        logging.info(cmd)
        common.check_shell(cmd)
-        common.count_folders_and_check_contents(pdf_res_path)
+        #common.count_folders_and_check_contents(pdf_res_path)
 if __name__ == "__main__":

--- a/tools/README.MD
+++ b/tools/README.MD
-# 工具脚本使用说明
-### OCR Badcase Commands
- **Command without badcase output:**
-  `python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_overall base_data_ocr.json`
- **Command with badcase output:**
-  `python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_overall base_data_ocr.json --badcase_path ocr_badcase`
-### Text Badcase Commands
- **Command without badcase output:**
-    `python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_overall base_data_text.json`
- **Command with badcase output:**
-    ` python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_overall base_data_text.json --badcase_path text_badcase`
- **Command with upload to s3:**
-  -  add the following arguments to the command 
-        `--s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY Your AK  --AWS_SECRET_KEY Your SK --END_POINT_URL Your Endpoint ` 
--- a/tools/base_data_ocr.json
+++ b/tools/base_data_ocr.json
-{
-    "accuracy": 1.0,
-    "precision": 1.0,
-    "recall": 1.0,
-    "f1_score": 1.0,
-    "pdf间的平均编辑距离": 133.10256410256412,
-    "pdf间的平均bleu": 0.28838311595434046,
-    "分段准确率": 0.07220216606498195,
-    "行内公式准确率": {
-        "accuracy": 0.004835727492533068,
-        "precision": 0.008790072388831437,
-        "recall": 0.010634970284641852,
-        "f1_score": 0.009624911535739562
-    },
-    "行内公式编辑距离": 1.6176470588235294,
-    "行内公式bleu": 0.17154724654721457,
-    "行间公式准确率": {
-        "accuracy": 0.08490566037735849,
-        "precision": 0.1836734693877551,
-        "recall": 0.13636363636363635,
-        "f1_score": 0.1565217391304348
-    },
-    "行间公式编辑距离": 113.22222222222223,
-    "行间公式bleu": 0.2531053359913409,
-    "丢弃文本准确率": {
-        "accuracy": 0.00035398230088495576,
-        "precision": 0.0006389776357827476,
-        "recall": 0.0007930214115781126,
-        "f1_score": 0.0007077140835102619
-    },
-    "丢弃文本标签准确率": {
-        "color_background_header_txt_block": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 41.0
-        },
-        "header": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 4.0
-        },
-        "footnote": {
-            "precision": 1.0,
-            "recall": 0.009708737864077669,
-            "f1-score": 0.019230769230769232,
-            "support": 103.0
-        },
-        "on-table": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 665.0
-        },
-        "rotate": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 63.0
-        },
-        "on-image": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 380.0
-        },
-        "micro avg": {
-            "precision": 1.0,
-            "recall": 0.0007961783439490446,
-            "f1-score": 0.0015910898965791568,
-            "support": 1256.0
-        }
-    },
-    "丢弃图片准确率": {
-        "accuracy": 0.0,
-        "precision": 0.0,
-        "recall": 0.0,
-        "f1_score": 0.0
-    },
-    "丢弃表格准确率": {
-        "accuracy": 0.0,
-        "precision": 0.0,
-        "recall": 0.0,
-        "f1_score": 0.0
-    }
-}
\ No newline at end of file
--- a/tools/base_data_text.json
+++ b/tools/base_data_text.json
-{
-    "accuracy": 1.0,
-    "precision": 1.0,
-    "recall": 1.0,
-    "f1_score": 1.0,
-    "pdf间的平均编辑距离": 19.82051282051282,
-    "pdf间的平均bleu": 0.9002485609584511,
-    "阅读顺序编辑距离": 0.3176895306859206,
-    "分段准确率": 0.8989169675090253,
-    "行内公式准确率": {
-        "accuracy": 0.9782741738066095,
-        "precision": 0.9782741738066095,
-        "recall": 1.0,
-        "f1_score": 0.9890177880897139
-    },
-    "行内公式编辑距离": 0.0,
-    "行内公式bleu": 0.20340450120213166,
-    "行间公式准确率": {
-        "accuracy": 1.0,
-        "precision": 1.0,
-        "recall": 1.0,
-        "f1_score": 1.0
-    },
-    "行间公式编辑距离": 0.0,
-    "行间公式bleu": 0.3662262622386575,
-    "丢弃文本准确率": {
-        "accuracy": 0.867870036101083,
-        "precision": 0.9064856711915535,
-        "recall": 0.9532117367168914,
-        "f1_score": 0.9292616930807885
-    },
-    "丢弃文本标签准确率": {
-        "color_background_header_txt_block": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 41.0
-        },
-        "rotate": {
-            "precision": 1.0,
-            "recall": 0.9682539682539683,
-            "f1-score": 0.9838709677419355,
-            "support": 63.0
-        },
-        "footnote": {
-            "precision": 1.0,
-            "recall": 0.883495145631068,
-            "f1-score": 0.9381443298969072,
-            "support": 103.0
-        },
-        "header": {
-            "precision": 1.0,
-            "recall": 1.0,
-            "f1-score": 1.0,
-            "support": 4.0
-        },
-        "on-image": {
-            "precision": 0.9947643979057592,
-            "recall": 1.0,
-            "f1-score": 0.9973753280839895,
-            "support": 380.0
-        },
-        "on-table": {
-            "precision": 1.0,
-            "recall": 0.9443609022556391,
-            "f1-score": 0.97138437741686,
-            "support": 665.0
-        },
-        "micro avg": {
-            "precision": 0.9982847341337907,
-            "recall": 0.9267515923566879,
-            "f1-score": 0.9611890999174236,
-            "support": 1256.0
-        }
-    },
-    "丢弃图片准确率": {
-        "accuracy": 0.8666666666666667,
-        "precision": 0.9285714285714286,
-        "recall": 0.9285714285714286,
-        "f1_score": 0.9285714285714286
-    },
-    "丢弃表格准确率": {
-        "accuracy": 0,
-        "precision": 0,
-        "recall": 0,
-        "f1_score": 0
-    }
-}
\ No newline at end of file
--- a/tools/benchmark.py
+++ b/tools/benchmark.py
+import zipfile
+import os
+import shutil
+code_path = os.environ.get('GITHUB_WORKSPACE')
+pdf_dev_path = "/home/quyuan/data"
+pdf_res_path = "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/ci/magic-pdf"
+def test_cli():
+    cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py  pdf-command  --pdf {}' % (code_path, pdf_dev_path)
+    os.system(cmd)
+    if not os.path.exists(os.path.join(pdf_dev_path, "output")):
+        os.makedirs(os.path.join(pdf_dev_path, "output"))
+    for annotaion_name in os.listdir(os.path.join(pdf_dev_path, "output")):
+        if annotaion_name.endswith('.pdf'):
+            for pdf_res_path  in os.listdir(pdf_res_path):
+                if ".md" in os.path.join(pdf_res_path, annotaion_name, "auto"):
+                    prefix = annotaion_name.split('_')[-2]
+                    if not os.path.exists(os.join(pdf_dev_path, prefix)):
+                        os.makedirs(os.path.join(pdf_dev_path, prefix))
+                        shutil.copy(os.path.join(pdf_res_path, annotaion_name, "auto", annotaion_name + ".md"), os.join(pdf_dev_path, prefix, annotaion_name + ".md"))
+def calculate_score():
+    cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir %s" % (code_path, pdf_dev_path)
+    os.system(cmd)
+    cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, pdf_dev_path)
+    os.system(cmd)
+    cmd = "cd %s && export PYTHONPATH=. && python tools/markdown_calculate.py --tool_name pdf-command --download_dir %s --results %s" % (code_path, pdf_dev_path, os.path.join(pdf_dev_path, "result.json"))
+    os.system(cmd)
+def extrat_zip(zip_file_path, extract_to_path):
+    if zipfile.is_zipfile(zip_file_path):
+        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_to_path)
+        print(f'Files extracted to {extract_to_path}')
+    else:
+        print(f'{zip_file_path} is not a zip file')
+if __name__ == "__main__":
+    extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path,'datasets'))
+    test_cli()
+    calculate_score()
--- a/tools/clean_photo.py
+++ b/tools/clean_photo.py
+import pypandoc
+import re  
+import htmltabletomd
+import os  
+import argparse
+import zipfile
+parser = argparse.ArgumentParser(description="get tool type")
+parser.add_argument(
+    "--tool_name",
+    type=str,
+    required=True,
+    help="input tool name",
+)
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    required=True,
+    help="input download dir",
+)
+args = parser.parse_args()
+def clean_markdown_images(content):  
+    pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)  
+    cleaned_content = pattern.sub('', content)   
+    return cleaned_content
+def clean_ocrmath_photo(content):
+    pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)  
+    cleaned_content = pattern.sub('', content)   
+    return cleaned_content
+def convert_html_table_to_md(html_table):  
+    lines = html_table.strip().split('\n')  
+    md_table = ''  
+    if lines and '<tr>' in lines[0]:  
+        in_thead = True  
+        for line in lines:  
+            if '<th>' in line:  
+                cells = re.findall(r'<th>(.*?)</th>', line)  
+                md_table += '| ' + ' | '.join(cells) + ' |\n'  
+                in_thead = False  
+            elif '<td>' in line and not in_thead:  
+                cells = re.findall(r'<td>(.*?)</td>', line)  
+                md_table += '| ' + ' | '.join(cells) + ' |\n'  
+        md_table = md_table.rstrip() + '\n'    
+    return md_table  
+def convert_latext_to_md(content):  
+    tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)  
+    placeholders = []  
+    for table in tables:  
+        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
+        replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
+        content = content.replace(replace_str, placeholder)  
+        try:
+            pypandoc.convert_text(replace_str,  format="latex", to="md", outputfile="output.md", encoding="utf-8")
+        except:
+            markdown_string = replace_str
+        else: 
+            markdown_string = open('output.md', 'r', encoding='utf-8').read()
+        placeholders.append((placeholder, markdown_string)) 
+    new_content = content  
+    for placeholder, md_table in placeholders:  
+        new_content = new_content.replace(placeholder, md_table)  
+        # 写入文件  
+    return new_content
+def convert_htmltale_to_md(content):  
+    tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)  
+    placeholders = []  
+    for table in tables:  
+        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
+        content = content.replace(f"<table>{table}</table>", placeholder)  
+        try:
+            convert_table = htmltabletomd.convert_table(table)
+        except:
+            convert_table = table
+        placeholders.append((placeholder,convert_table)) 
+    new_content = content  
+    for placeholder, md_table in placeholders:  
+        new_content = new_content.replace(placeholder, md_table)  
+        # 写入文件  
+    return new_content
+def clean_data(prod_type, download_dir):
+    file_type = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
+    for filetype in file_type:
+        tgt_dir = os.path.join(download_dir, filetype, prod_type, "cleaned")
+        if not os.path.exists(tgt_dir):  
+            os.makedirs(tgt_dir) 
+        source_dir = os.path.join(download_dir, filetype, prod_type)
+        filenames = os.listdir(source_dir)
+        for filename in filenames:
+            if filename.endswith('.md'):
+                input_file = os.path.join(source_dir, filename)
+                output_file = os.path.join(tgt_dir, "cleaned_" + filename)
+                with open(input_file, 'r', encoding='utf-8') as fr:
+                    content = fr.read()
+                    new_content = convert_htmltale_to_md(content)
+                    new_content = clean_markdown_images(new_content)
+                    new_content = clean_ocrmath_photo(new_content)
+                    new_content = convert_latext_to_md(new_content)
+                    with open(output_file, 'w', encoding='utf-8') as fw:
+                        fw.write(new_content)
+if __name__ == '__main__':
+    tool_type = args.tool_name
+    download_dir = args.download_dir
+    clean_data(tool_type, download_dir)
--- a/tools/config_init_to_json.py
+++ b/tools/config_init_to_json.py
-from loguru import logger
-import json
-import os
-from magic_pdf.config import s3_buckets, s3_clusters, s3_users
-def get_bucket_configs_dict(buckets, clusters, users):
-    bucket_configs = {}
-    for s3_bucket in buckets.items():
-        bucket_name = s3_bucket[0]
-        bucket_config = s3_bucket[1]
-        cluster, user = bucket_config
-        cluster_config = clusters[cluster]
-        endpoint_key = "outside"
-        endpoints = cluster_config[endpoint_key]
-        endpoint = endpoints[0]
-        user_config = users[user]
-        # logger.info(bucket_name)
-        # logger.info(endpoint)
-        # logger.info(user_config)
-        bucket_config = [user_config["ak"], user_config["sk"], endpoint]
-        bucket_configs[bucket_name] = bucket_config
-    return bucket_configs
-def write_json_to_home(my_dict):
-    # Convert dictionary to JSON
-    json_data = json.dumps(my_dict, indent=4, ensure_ascii=False)
-    home_dir = os.path.expanduser("~")
-    # Define the output file path
-    output_file = os.path.join(home_dir, "magic-pdf.json")
-    # Write JSON data to the output file
-    with open(output_file, "w") as f:
-        f.write(json_data)
-    # Print a success message
-    print(f"Dictionary converted to JSON and saved to {output_file}")
-if __name__ == '__main__':
-    bucket_configs_dict = get_bucket_configs_dict(s3_buckets, s3_clusters, s3_users)
-    logger.info(bucket_configs_dict)
-    config_dict = {
-        "bucket_info": bucket_configs_dict,
-        "temp-output-dir": "/tmp"
-    }
-    write_json_to_home(config_dict)
--- a/tools/json_files.zip
+++ b/tools/json_files.zip
--- a/tools/json_files/ocr_dataset.json
+++ b/tools/json_files/ocr_dataset.json
--- a/tools/json_files/pdf_json_label_0229.json
+++ b/tools/json_files/pdf_json_label_0229.json
--- a/tools/json_files/pdf_json_label_0306.json
+++ b/tools/json_files/pdf_json_label_0306.json
--- a/tools/markdown_calculate.py
+++ b/tools/markdown_calculate.py
+import os  
+from Levenshtein import distance  
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
+from nltk.tokenize import word_tokenize  
+import json 
+import re
+import scoring
+import argparse
+parser = argparse.ArgumentParser(description="get directory")
+parser.add_argument('--document_types', 
+    nargs='+',
+    choices=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"], 
+    help='Choose one or more document_types',
+    default=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
+)
+parser.add_argument(
+    "--tool_name",
+    type=str,
+    required=True,
+    help="tool name",
+)
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    required=True,
+    help="input download dir",
+)
+parser.add_argument(
+    "--results",
+    type=str,
+    required=True,
+    help="results path(end with .json)",
+)
+args = parser.parse_args()
+fw = open(args.results, 'w+', encoding='utf-8')
+# 初始化列表来存储编辑距离和BLEU分数  
+class Scoring:
+    def __init__(self):
+        self.edit_distances = []
+        self.bleu_scores = []
+        self.sim_scores = []
+        self.filenames = []
+        self.score_dict = {}
+        self.anntion_cnt = 0
+    def simple_bleu_score(self, candidate, reference):  
+        candidate_tokens = word_tokenize(candidate)  
+        reference_tokens = word_tokenize(reference) 
+        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1) 
+    def preprocess_string(self, s):  
+        sub_enter = re.sub(r'\n+', '\n', s)
+        return re.sub(r'  ', ' ', sub_enter)
+    def calculate_similarity(self, annotion, actual, tool_type):
+        class_dict = {}
+        edit_distances = []
+        bleu_scores = []
+        sim_scores = list()
+        total_file = 0
+        for filename in os.listdir(annotion):  
+            if filename.endswith('.md') and not filename.startswith('.'):  # 忽略隐藏文件  
+                total_file = total_file + 1
+                # 读取A目录中的文件  
+                with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:  
+                    content_a = file_a.read()
+                self.anntion_cnt = self.anntion_cnt + 1
+                filepath_b = os.path.join(actual, filename)  
+                if os.path.exists(filepath_b):  
+                    with open(filepath_b, 'r', encoding='utf-8') as file_b:  
+                        content_b = file_b.read()
+                        self.filenames.append(filename)
+                        # 计算编辑距离
+                        edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
+                        self.edit_distances.append(edit_dist)  
+                        edit_distances.append(edit_dist)
+                        #计算BLUE分数
+                        bleu_score = self.simple_bleu_score(content_b, content_a)  
+                        bleu_scores.append(bleu_score)
+                        self.bleu_scores.append(bleu_score)  
+                        #计算marker分数
+                        score = scoring.score_text(content_b, content_a)
+                        sim_scores.append(score)
+                        self.sim_scores.append(score)
+                        class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
+                        self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
+                else:  
+                    print(f"File {filename} not found in actual directory.")  
+        # 计算每类平均值
+        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0  
+        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0  
+        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
+        fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
+        ratio = len(class_dict)/total_file
+        fw.write(f"{tool_type} extract ratio:  {ratio}" + "\n")
+        fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
+        fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
+        fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
+        print (f"{tool_type} extract ratio: {ratio}")
+        print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
+        print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
+        print (f"{tool_type} Average Sim Score: {class_average_sim_score}")
+        return self.score_dict
+    def summary_scores(self):
+         # 计算整体平均值
+        average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0  
+        average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0  
+        average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
+        #self.fw.write(json.dumps(self.score_dict, ensure_ascii=False) + "\n")
+        fw.write(f"Overall extract cnt: {len(self.score_dict)/self.anntion_cnt}" + "\n")
+        fw.write(f"Overall Average Levenshtein Distance: {average_edit_distance}" + "\n")
+        fw.write(f"Overall Average BLEU Score: {average_bleu_score}" + "\n")
+        fw.write(f"Overall Average Marker Score: {average_sim_score}" + "\n") 
+        print ("Overall extract ratio: ", len(self.score_dict)/self.anntion_cnt)
+        print (f"Overall Average Levenshtein Distance: {average_edit_distance}")
+        print (f"Overall Average BLEU Score: {average_bleu_score}")
+        print (f"Overall Average Marker Score: {average_sim_score}")
+        fw.close()
+    def calculate_similarity_total(self, tool_type, file_types, download_dir):
+        for file_type in file_types:
+            annotion = os.path.join(download_dir, file_type, "annotations", "cleaned")
+            actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
+            self.calculate_similarity(annotion, actual, file_type)
+if __name__ == "__main__":  
+  file_types = list()
+  tool_type =args.tool_name
+  download_dir = args.download_dir
+  if args.document_types:
+    print("Selected types:", args.document_types)
+    for type_ in args.document_types:
+        file_types.append(type_)
+  else:
+      print("No types selected")
+  print(f"Type {file_types} is selected. Executing related operations...")
+  score = Scoring()
+  score.calculate_similarity_total(tool_type, file_types, download_dir)
+  score.summary_scores()
--- a/tools/ocr_badcase.py
+++ b/tools/ocr_badcase.py
--- a/tools/over_all_benchamark.py
+++ b/tools/over_all_benchamark.py
--- a/tools/pdf_json_label_0229.json
+++ b/tools/pdf_json_label_0229.json
--- a/tools/pdf_json_label_0306.json
+++ b/tools/pdf_json_label_0306.json
--- a/tools/result.txt
+++ b/tools/result.txt
--- a/tools/scoring.py
+++ b/tools/scoring.py
+import math
+from rapidfuzz import fuzz
+import re
+import regex
+from statistics import mean
+CHUNK_MIN_CHARS = 25
+def chunk_text(text, chunk_len=500):
+    chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)]
+    chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
+    return chunks
+def overlap_score(hypothesis_chunks, reference_chunks):
+    if len(reference_chunks) > 0:
+        length_modifier = len(hypothesis_chunks) / len(reference_chunks)
+    else:
+        length_modifier = 0
+    search_distance = max(len(reference_chunks) // 5, 10)
+    chunk_scores = []
+    for i, hyp_chunk in enumerate(hypothesis_chunks):
+        max_score = 0
+        total_len = 0
+        i_offset = int(i * length_modifier)
+        chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
+        for j in chunk_range:
+            ref_chunk = reference_chunks[j]
+            score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
+            if score > max_score:
+                max_score = score
+                total_len = len(ref_chunk)
+        chunk_scores.append(max_score)
+    return chunk_scores
+def score_text(hypothesis, reference):
+    # Returns a 0-1 alignment score
+    hypothesis_chunks = chunk_text(hypothesis)
+    reference_chunks = chunk_text(reference)
+    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
+    if len(chunk_scores) > 0:
+        mean_score = mean(chunk_scores)
+        return mean_score
+    else:
+        return 0
+    #return mean(chunk_scores)
\ No newline at end of file
--- a/tools/text_badcase.py
+++ b/tools/text_badcase.py