Unverified Commit 351078f1 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub

Merge pull request #141 from dt-yy/master

add ci
parents 833189a3 4fcaf679
...@@ -4,12 +4,12 @@ calculate_score ...@@ -4,12 +4,12 @@ calculate_score
import os import os
import re import re
import json import json
from Levenshtein import distance
from lib import scoring from lib import scoring
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
import nltk import nltk
nltk.download('punkt') nltk.download('punkt')
from Levenshtein import distance
class Scoring: class Scoring:
""" """
......
...@@ -118,9 +118,6 @@ def clean_data(prod_type, download_dir): ...@@ -118,9 +118,6 @@ def clean_data(prod_type, download_dir):
with open(input_file, 'r', encoding='utf-8') as fr: with open(input_file, 'r', encoding='utf-8') as fr:
content = fr.read() content = fr.read()
new_content = clean_markdown_images(content) new_content = clean_markdown_images(content)
new_content = convert_html_table_to_md(new_content)
new_content = convert_latext_to_md(new_content)
new_content = convert_htmltale_to_md(new_content)
with open(output_file, 'w', encoding='utf-8') as fw: with open(output_file, 'w', encoding='utf-8') as fw:
fw.write(new_content) fw.write(new_content)
......
"""
Calculate simscore, refer to (https://github.com/VikParuchuri/marker?tab=readme-ov-file)
"""
import math import math
from rapidfuzz import fuzz from rapidfuzz import fuzz
......
{"average_sim_score":0.6505598645664856, "average_edit_distance":0.2514908429188901, "average_bleu_score": 0.5808819533975296}
\ No newline at end of file
...@@ -11,9 +11,6 @@ from conf import conf ...@@ -11,9 +11,6 @@ from conf import conf
code_path = os.environ.get('GITHUB_WORKSPACE') code_path = os.environ.get('GITHUB_WORKSPACE')
pdf_dev_path = conf.conf["pdf_dev_path"] pdf_dev_path = conf.conf["pdf_dev_path"]
pdf_res_path = conf.conf["pdf_res_path"] pdf_res_path = conf.conf["pdf_res_path"]
last_simscore = 0
last_editdistance = 0
last_bleu = 0
class TestBench(): class TestBench():
""" """
...@@ -23,7 +20,6 @@ class TestBench(): ...@@ -23,7 +20,6 @@ class TestBench():
""" """
ci benchmark ci benchmark
""" """
try:
fr = open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8") fr = open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8")
lines = fr.readlines() lines = fr.readlines()
last_line = lines[-1].strip() last_line = lines[-1].strip()
...@@ -31,8 +27,6 @@ class TestBench(): ...@@ -31,8 +27,6 @@ class TestBench():
last_simscore = last_score["average_sim_score"] last_simscore = last_score["average_sim_score"]
last_editdistance = last_score["average_edit_distance"] last_editdistance = last_score["average_edit_distance"]
last_bleu = last_score["average_bleu_score"] last_bleu = last_score["average_bleu_score"]
except IOError:
print ("result.json not exist")
os.system(f"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}") os.system(f"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
now_score = get_score() now_score = get_score()
print ("now_score:", now_score) print ("now_score:", now_score)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment