Qin Kaijie / pdf-miner / Commits

Commit 34ed90b7 authored Jun 28, 2024 by 赵小蒙
Merge remote-tracking branch 'origin/master'

Parents: f84eb897, d3d7a093
Showing 5 changed files with 36 additions and 72 deletions (+36, -72).
.github/workflows/benchmark.yml   +0   -13
README.md                         +1   -1
README_zh-CN.md                   +1   -1
tools/benchmark.py                +25  -6
tools/markdown_calculate.py       +9   -51
.github/workflows/benchmark.yml

@@ -49,16 +49,3 @@ jobs:
           echo "start test"
           cd tools && python benchmark.py
-  notify_to_feishu:
-    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
-    needs: [pdf-test]
-    runs-on: pdf
-    steps:
-      - name: notify
-        run: |
-          curl ${{ secrets.WEBHOOK_URL }} -H 'Content-Type: application/json' -d '{
-            "msgtype": "text",
-            "text": {
-              "content": "'${{ github.repository }}' GitHubAction Failed!\n 细节请查看:https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"
-            }
-          }'
README.md

@@ -48,7 +48,7 @@ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3
 ### Submodule Repositories
 
-- [pdf-extract-kit](https://github.com/wangbinDL/pdf-extract-kit)
+- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
 - [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
 
 ## Getting Started
README_zh-CN.md

@@ -57,7 +57,7 @@ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3
 ### 子模块仓库
 
-- [pdf-extract-kit](https://github.com/wangbinDL/pdf-extract-kit)
+- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) 领先的文档分析模型
 - [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark) 端到端的PDF文档理解评估套件,专为大规模模型数据场景而设计
tools/benchmark.py

 import zipfile
 import os
 import shutil
 import json
+import markdown_calculate
 
 code_path = os.environ.get('GITHUB_WORKSPACE')
 #code_path = "/home/quyuan/actions-runner/_work/Magic-PDF/Magic-PDF.bk"
 #评测集存放路径

@@ -34,8 +36,10 @@ def calculate_score():
     os.system(cmd)
     cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
     os.system(cmd)
-    cmd = "cd %s && export PYTHONPATH=. && python tools/markdown_calculate.py --tool_name magicpdf --download_dir %s --results %s" % (code_path, data_path, os.path.join(data_path, "result.json"))
-    os.system(cmd)
+    score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
+    score.calculate_similarity_total("magicpdf", file_types, data_path)
+    res = score.summary_scores()
+    return res
 
 
 def extrat_zip(zip_file_path, extract_to_path):

@@ -49,9 +53,24 @@ def extrat_zip(zip_file_path, extract_to_path):
 def ci_ben():
     fr = open(os.path.join(pdf_dev_path, "ci", "result.json"), "r")
-
-if __name__ == "__main__":
+    lines = fr.readlines()
+    last_line = lines[-1].strip()
+    last_score = json.loads(last_line)
+    print("last_score:", last_score)
+    last_simscore = last_score["average_sim_score"]
+    last_editdistance = last_score["average_edit_distance"]
+    last_bleu = last_score["average_bleu_score"]
     extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
     test_cli()
-    calculate_score()
+    now_score = calculate_score()
+    print("now_score:", now_score)
+    now_simscore = now_score["average_sim_score"]
+    now_editdistance = now_score["average_edit_distance"]
+    now_bleu = now_score["average_bleu_score"]
+    assert last_simscore <= now_simscore
+    assert last_editdistance <= now_editdistance
+    assert last_bleu <= now_bleu
 
 
 if __name__ == "__main__":
     ci_ben()
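The reworked ci_ben() is effectively a regression gate: it reads the last recorded scores from ci/result.json, recomputes the current averages through calculate_score(), and asserts that none of the three metrics has dropped. A minimal standalone sketch of that check, mirroring the asserts above; the baseline path and example numbers here are hypothetical, only the three key names come from the diff:

import json

def regression_check(baseline_path, now_score):
    # The last line of the baseline file is expected to be a JSON object
    # holding the previously recorded averages.
    with open(baseline_path, "r", encoding="utf-8") as fr:
        last_score = json.loads(fr.readlines()[-1].strip())
    # Each averaged metric must not fall below the previous run.
    for key in ("average_sim_score", "average_edit_distance", "average_bleu_score"):
        assert last_score[key] <= now_score[key], f"{key} regressed"

# Example usage with made-up values:
# regression_check("ci/result.json",
#                  {"average_sim_score": 0.9, "average_edit_distance": 0.8, "average_bleu_score": 0.7})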
tools/markdown_calculate.py

@@ -7,44 +7,16 @@ import re
 import scoring
 import argparse
 
-parser = argparse.ArgumentParser(description="get directory")
-parser.add_argument('--document_types', nargs='+',
-                    choices=["academic_literature", "atlas", "courseware", "colorful_textbook",
-                             "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper",
-                             "ordinary_textbook", "research_report", "special_exam_paper"],
-                    help='Choose one or more document_types',
-                    default=["academic_literature", "atlas", "courseware", "colorful_textbook",
-                             "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper",
-                             "ordinary_textbook", "research_report", "special_exam_paper"])
-parser.add_argument("--tool_name", type=str, required=True, help="tool name")
-parser.add_argument("--download_dir", type=str, required=True, help="input download dir")
-parser.add_argument("--results", type=str, required=True, help="results path(end with .json)")
-args = parser.parse_args()
-fw = open(args.results, 'w+', encoding='utf-8')
 
 # 初始化列表来存储编辑距离和BLEU分数
 class Scoring:
-    def __init__(self):
+    def __init__(self, result_path):
         self.edit_distances = []
         self.bleu_scores = []
         self.sim_scores = []
         self.filenames = []
         self.score_dict = {}
         self.anntion_cnt = 0
+        self.fw = open(result_path, "w+")
 
     def simple_bleu_score(self, candidate, reference):
         candidate_tokens = word_tokenize(candidate)
         reference_tokens = word_tokenize(reference)

@@ -93,12 +65,12 @@ class Scoring:
         class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
         class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
         class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
-        fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
+        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
         ratio = len(class_dict) / total_file
-        fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
-        fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
-        fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
-        fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
+        self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
+        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
+        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
+        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
         print(f"{tool_type} extract ratio: {ratio}")
         print(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")

@@ -115,8 +87,8 @@ class Scoring:
         over_all_dict["average_edit_distance"] = average_edit_distance
         over_all_dict["average_bleu_score"] = average_bleu_score
         over_all_dict["average_sim_score"] = average_sim_score
-        fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
+        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
         return over_all_dict
 
     def calculate_similarity_total(self, tool_type, file_types, download_dir):
         for file_type in file_types:

@@ -124,17 +96,3 @@ class Scoring:
             actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
             self.calculate_similarity(annotion, actual, file_type)
 
-
-if __name__ == "__main__":
-    file_types = list()
-    tool_type = args.tool_name
-    download_dir = args.download_dir
-    if args.document_types:
-        print("Selected types:", args.document_types)
-        for type_ in args.document_types:
-            file_types.append(type_)
-    else:
-        print("No types selected")
-    print(f"Type {file_types} is selected. Executing related operations...")
-    score = Scoring()
-    score.calculate_similarity_total(tool_type, file_types, download_dir)
-    score.summary_scores()
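With the command-line parser and the module-level fw handle removed, Scoring is now driven programmatically, which is how tools/benchmark.py uses it after this change. A minimal usage sketch under the new constructor signature; the directory and the single document type below are placeholders, not values from the repository:

import os
import markdown_calculate

data_path = "/tmp/pdf_dev"  # hypothetical download dir containing per-type annotations and tool output
score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))  # __init__ now opens the results file itself
score.calculate_similarity_total("magicpdf", ["academic_literature"], data_path)
res = score.summary_scores()  # returns the overall dict with the average_* keys and also writes it to result.json
print(res)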