Commit 4bedda7b authored by quyuan's avatar quyuan

update require txt

parent b474a00a
......@@ -36,8 +36,8 @@ def calculate_score():
os.system(cmd)
cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
os.system(cmd)
score = markdown_calculate.Scoring()
score.calculate_similarity_total("magicpdf", file_types, os.path.join(data_path, "result.json"))
score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
score.calculate_similarity_total("magicpdf", file_types, data_path)
res = score.summary_scores()
return res
......
......@@ -7,44 +7,16 @@ import re
import scoring
import argparse
# Command-line interface: which document categories to score, which tool's
# output is being evaluated, where the downloaded data lives, and where the
# JSON report should be written.
parser = argparse.ArgumentParser(description="get directory")

# The closed set of document categories; also used as the default so that
# omitting --document_types evaluates every category.
_doc_type_choices = [
    "academic_literature",
    "atlas",
    "courseware",
    "colorful_textbook",
    "historical_documents",
    "notes",
    "ordinary_books",
    "ordinary_exam_paper",
    "ordinary_textbook",
    "research_report",
    "special_exam_paper",
]
parser.add_argument(
    "--document_types",
    nargs="+",
    choices=_doc_type_choices,
    default=list(_doc_type_choices),
    help="Choose one or more document_types",
)
parser.add_argument("--tool_name", type=str, required=True, help="tool name")
parser.add_argument("--download_dir", type=str, required=True, help="input download dir")
parser.add_argument("--results", type=str, required=True, help="results path(end with .json)")

args = parser.parse_args()
# Report file shared by the scoring code below; opened for the process lifetime.
fw = open(args.results, "w+", encoding="utf-8")
# Lists that accumulate per-file edit distances and BLEU scores
class Scoring:
def __init__(self):
def __init__(self, result_path):
self.edit_distances = []
self.bleu_scores = []
self.sim_scores = []
self.filenames = []
self.score_dict = {}
self.anntion_cnt = 0
self.fw = open(result_path, "w+")
def simple_bleu_score(self, candidate, reference):
candidate_tokens = word_tokenize(candidate)
reference_tokens = word_tokenize(reference)
......@@ -93,12 +65,12 @@ class Scoring:
class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
ratio = len(class_dict)/total_file
fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
print (f"{tool_type} extract ratio: {ratio}")
print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
......@@ -115,7 +87,7 @@ class Scoring:
over_all_dict["average_edit_distance"] = average_edit_distance
over_all_dict["average_bleu_score"] = average_bleu_score
over_all_dict["average_sim_score"] = average_sim_score
fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
return over_all_dict
def calculate_similarity_total(self, tool_type, file_types, download_dir):
......@@ -124,17 +96,3 @@ class Scoring:
actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
self.calculate_similarity(annotion, actual, file_type)
if __name__ == "__main__":
    # Score one tool's markdown output against the annotations for the
    # selected document categories, then print/write the summary.
    tool_type = args.tool_name
    download_dir = args.download_dir
    # argparse supplies the full category list as the default, so
    # --document_types is normally always truthy; the else branch is a
    # defensive fallback only.
    if args.document_types:
        print("Selected types:", args.document_types)
        file_types = list(args.document_types)  # copy, don't alias args
    else:
        print("No types selected")
        file_types = []
    print(f"Type {file_types} is selected. Executing related operations...")
    score = Scoring()
    score.calculate_similarity_total(tool_type, file_types, download_dir)
    score.summary_scores()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment