Commit 34ed90b7 authored by 赵小蒙

Merge remote-tracking branch 'origin/master'

parents f84eb897 d3d7a093
@@ -49,16 +49,3 @@ jobs:
echo "start test"
cd tools && python benchmark.py
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
needs: [pdf-test]
runs-on: pdf
steps:
- name: notify
run: |
curl ${{ secrets.WEBHOOK_URL }} -H 'Content-Type: application/json' -d '{
"msgtype": "text",
"text": {
"content": "'${{ github.repository }}' GitHubAction Failed!\n 细节请查看:https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"
}
}'
@@ -48,7 +48,7 @@ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3
### Submodule Repositories
- [pdf-extract-kit](https://github.com/wangbinDL/pdf-extract-kit)
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
- [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
## Getting Started
......
@@ -57,7 +57,7 @@ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3
### Submodule Repositories
- [pdf-extract-kit](https://github.com/wangbinDL/pdf-extract-kit)
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
A leading document analysis model
- [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
An end-to-end PDF document understanding evaluation suite, designed for large-scale model data scenarios
......
import zipfile
import os
import shutil
import json
import markdown_calculate
code_path = os.environ.get('GITHUB_WORKSPACE')
#code_path = "/home/quyuan/actions-runner/_work/Magic-PDF/Magic-PDF.bk"
# Path where the evaluation data set is stored
@@ -34,8 +36,10 @@ def calculate_score():
    os.system(cmd)
    cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
    os.system(cmd)
    # Score the cleaned magicpdf output in-process via the Scoring class and return the summary.
    score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
    score.calculate_similarity_total("magicpdf", file_types, data_path)
    res = score.summary_scores()
    return res
def extrat_zip(zip_file_path, extract_to_path):
@@ -49,9 +53,24 @@ def extrat_zip(zip_file_path, extract_to_path):

def ci_ben():
    # Load the last recorded benchmark scores (the final line of the stored result.json).
    fr = open(os.path.join(pdf_dev_path, "ci", "result.json"), "r")
    lines = fr.readlines()
    last_line = lines[-1].strip()
    last_score = json.loads(last_line)
    print("last_score:", last_score)
    last_simscore = last_score["average_sim_score"]
    last_editdistance = last_score["average_edit_distance"]
    last_bleu = last_score["average_bleu_score"]
    # Re-run extraction on the evaluation set and compute fresh scores.
    extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
    test_cli()
    now_score = calculate_score()
    print("now_score:", now_score)
    now_simscore = now_score["average_sim_score"]
    now_editdistance = now_score["average_edit_distance"]
    now_bleu = now_score["average_bleu_score"]
    # Fail CI if any metric regresses relative to the last recorded run.
    assert last_simscore <= now_simscore
    assert last_editdistance <= now_editdistance
    assert last_bleu <= now_bleu


if __name__ == "__main__":
    ci_ben()
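For context, the summary line that ci_ben() reads back from result.json is the over_all_dict written by Scoring.summary_scores(); a minimal illustration follows, where the numeric values are made up and only the key names come from the code in this commit:

    import json

    # Illustrative only: the numbers are invented; the keys mirror over_all_dict.
    last_line = '{"average_sim_score": 0.80, "average_edit_distance": 0.25, "average_bleu_score": 0.60}'
    last_score = json.loads(last_line)
    print(last_score["average_sim_score"], last_score["average_edit_distance"], last_score["average_bleu_score"])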
@@ -7,44 +7,16 @@ import re
import scoring
import argparse
parser = argparse.ArgumentParser(description="get directory")
parser.add_argument('--document_types',
                    nargs='+',
                    choices=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"],
                    help='Choose one or more document_types',
                    default=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
                    )
parser.add_argument(
    "--tool_name",
    type=str,
    required=True,
    help="tool name",
)
parser.add_argument(
    "--download_dir",
    type=str,
    required=True,
    help="input download dir",
)
parser.add_argument(
    "--results",
    type=str,
    required=True,
    help="results path (end with .json)",
)
args = parser.parse_args()
fw = open(args.results, 'w+', encoding='utf-8')
# Initialize lists to store edit distances and BLEU scores
class Scoring:
    def __init__(self, result_path):
        self.edit_distances = []
        self.bleu_scores = []
        self.sim_scores = []
        self.filenames = []
        self.score_dict = {}
        self.anntion_cnt = 0
        self.fw = open(result_path, "w+")

    def simple_bleu_score(self, candidate, reference):
        candidate_tokens = word_tokenize(candidate)
        reference_tokens = word_tokenize(reference)
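The remainder of simple_bleu_score lies outside this hunk; a minimal sketch of how such a score is commonly computed with NLTK is shown below (the use of sentence_bleu and a smoothing function is an assumption, not something this diff confirms):

    from nltk.tokenize import word_tokenize
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

    def simple_bleu_score(candidate, reference):
        # Tokenize both strings and score the candidate against a single reference.
        candidate_tokens = word_tokenize(candidate)
        reference_tokens = word_tokenize(reference)
        return sentence_bleu([reference_tokens], candidate_tokens,
                             smoothing_function=SmoothingFunction().method1)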
@@ -93,12 +65,12 @@ class Scoring:
        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
        ratio = len(class_dict) / total_file
        self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
        print(f"{tool_type} extract ratio: {ratio}")
        print(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
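The per-file numbers averaged above (edit_distances, sim_scores) are computed outside this hunk; one plausible sketch, assuming the python-Levenshtein package and a hypothetical helper named pair_metrics, is:

    import Levenshtein

    def pair_metrics(extracted_text, annotated_text):
        # Character-level Levenshtein edit distance between extracted and annotated text.
        edit = Levenshtein.distance(extracted_text, annotated_text)
        # Similarity ratio in [0, 1]; 1.0 means identical strings.
        sim = Levenshtein.ratio(extracted_text, annotated_text)
        return edit, sim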
@@ -115,8 +87,8 @@ class Scoring:
        over_all_dict["average_edit_distance"] = average_edit_distance
        over_all_dict["average_bleu_score"] = average_bleu_score
        over_all_dict["average_sim_score"] = average_sim_score
        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
        return over_all_dict
    def calculate_similarity_total(self, tool_type, file_types, download_dir):
        for file_type in file_types:
@@ -124,17 +96,3 @@ class Scoring:
            actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
            self.calculate_similarity(annotion, actual, file_type)
if __name__ == "__main__":
    file_types = list()
    tool_type = args.tool_name
    download_dir = args.download_dir
    if args.document_types:
        print("Selected types:", args.document_types)
        for type_ in args.document_types:
            file_types.append(type_)
    else:
        print("No types selected")
    print(f"Type {file_types} is selected. Executing related operations...")
    score = Scoring()
    score.calculate_similarity_total(tool_type, file_types, download_dir)
    score.summary_scores()