Commit 80e7a50e authored by quyuan

add ci

parent 2e79da59
......@@ -35,6 +35,5 @@ jobs:
      - name: get-benchmark-result
        run: |
          echo "start test"
          cd $GITHUB_WORKSPACE/tests/benchmark/
          tree
          cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_ben.py
"""
bench
"""
import os
import shutil
import json
import calculate_score
code_path = os.environ.get('GITHUB_WORKSPACE')
#评测集存放路径
pdf_dev_path = "datasets/"
#magicpdf跑测结果
pdf_res_path = "/tmp/magic-pdf"
def test_cli():
"""
test pdf-command cli
"""
rm_cmd = f"rm -rf {pdf_res_path}"
os.system(rm_cmd)
os.makedirs(pdf_res_path)
cmd = f'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")}'
os.system(cmd)
for root, dirs, files in os.walk(pdf_res_path):
for magic_file in files:
target_dir = os.path.join(pdf_dev_path, "mineru")
if magic_file.endswith(".md"):
source_file = os.path.join(root, magic_file)
target_file = os.path.join(pdf_dev_path, "mineru", magic_file)
if not os.path.exists(target_dir):
os.makedirs(target_dir)
shutil.copy(source_file, target_file)


def get_score():
    """
    get score
    """
    data_path = os.path.join(pdf_dev_path, "ci")
    score = calculate_score.Scoring(os.path.join(data_path, "result.json"))
    score.calculate_similarity_total("mineru", data_path)
    res = score.summary_scores()
    return res


def ci_ben():
    """
    ci benchmark
    """
    # fall back to the zero baseline seeded in result.json if no history exists
    last_simscore = last_editdistance = last_bleu = 0
    try:
        with open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        last_line = lines[-1].strip()
        last_score = json.loads(last_line)
        print("last_score:", last_score)
        last_simscore = last_score["average_sim_score"]
        last_editdistance = last_score["average_edit_distance"]
        last_bleu = last_score["average_bleu_score"]
    except IOError:
        print("result.json does not exist, using zero baseline")
    test_cli()
    os.system(f"python pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
    now_score = get_score()
    print("now_score:", now_score)
    now_simscore = now_score["average_sim_score"]
    now_editdistance = now_score["average_edit_distance"]
    now_bleu = now_score["average_bleu_score"]
    assert last_simscore <= now_simscore
    assert last_editdistance <= now_editdistance
    assert last_bleu <= now_bleu


if __name__ == "__main__":
    os.system("sh env.sh")
    ci_ben()
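For reference, the regression gate above reduces to a three-way comparison between the last recorded summary line and the fresh one. A minimal sketch of that check in isolation (the function name `gate` is hypothetical; the keys are the ones written by `summary_scores()`):

```python
import json

def gate(last: dict, now: dict) -> None:
    """Fail the run if any tracked metric drops below the stored baseline."""
    for key in ("average_sim_score", "average_edit_distance", "average_bleu_score"):
        assert last[key] <= now[key], f"{key} regressed: {now[key]} < {last[key]}"

# e.g. gate(json.loads(last_line), get_score())
```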
"""
calculate_score
"""
import os
import re
import json
import scoring
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from Levenshtein import distance
class Scoring:
"""
calculate_score
"""
def __init__(self, result_path):
"""
init
"""
self.edit_distances = []
self.bleu_scores = []
self.sim_scores = []
self.filenames = []
self.score_dict = {}
self.anntion_cnt = 0
self.fw = open(result_path, "w+", encoding='utf-8')

    def simple_bleu_score(self, candidate, reference):
        """
        get bleu score
        """
        candidate_tokens = word_tokenize(candidate)
        reference_tokens = word_tokenize(reference)
        return sentence_bleu([reference_tokens], candidate_tokens,
                             smoothing_function=SmoothingFunction().method1)

    def preprocess_string(self, s):
        """
        collapse repeated newlines and spaces before comparison
        """
        sub_enter = re.sub(r'\n+', '\n', s)
        return re.sub(r' +', ' ', sub_enter)

    def calculate_similarity(self, annotation, actual, tool_type):
        """
        calculate_similarity
        """
        class_dict = {}
        edit_distances = []
        bleu_scores = []
        sim_scores = []
        total_file = 0
        for filename in os.listdir(annotation):
            if filename.endswith('.md') and not filename.startswith('.'):
                total_file = total_file + 1
                with open(os.path.join(annotation, filename), 'r', encoding='utf-8') as file_a:
                    content_a = file_a.read()
                self.annotation_cnt = self.annotation_cnt + 1
                filepath_b = os.path.join(actual, filename)
                if os.path.exists(filepath_b):
                    with open(filepath_b, 'r', encoding='utf-8') as file_b:
                        content_b = file_b.read()
                        self.filenames.append(filename)
                        # normalized Levenshtein distance in [0, 1]
                        edit_dist = distance(self.preprocess_string(content_b),
                                             self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
                        self.edit_distances.append(edit_dist)
                        edit_distances.append(edit_dist)
                        bleu_score = self.simple_bleu_score(content_b, content_a)
                        bleu_scores.append(bleu_score)
                        self.bleu_scores.append(bleu_score)
                        score = scoring.score_text(content_b, content_a)
                        sim_scores.append(score)
                        self.sim_scores.append(score)
                        class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                        self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                else:
                    print(f"File {filename} not found in actual directory.")
        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
        ratio = len(class_dict) / total_file if total_file else 0
        self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
        print(f"{tool_type} extract ratio: {ratio}")
        print(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
        print(f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
        print(f"{tool_type} Average Sim Score: {class_average_sim_score}")
        return self.score_dict

    def summary_scores(self):
        """
        average the edit distance, bleu score and sim score over all files
        """
        over_all_dict = {}
        average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
        average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
        average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
        over_all_dict["average_edit_distance"] = average_edit_distance
        over_all_dict["average_bleu_score"] = average_bleu_score
        over_all_dict["average_sim_score"] = average_sim_score
        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
        self.fw.close()
        return over_all_dict

    def calculate_similarity_total(self, tool_type, download_dir):
        """
        run the per-file comparison over the cleaned annotation and tool output dirs
        """
        annotation = os.path.join(download_dir, "annotations", "cleaned")
        actual = os.path.join(download_dir, tool_type, "cleaned")
        score = self.calculate_similarity(annotation, actual, tool_type)
        return score
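A usage sketch for the class above, assuming the directory layout produced by pre_clean.py (an annotations/cleaned ground-truth dir and a <tool>/cleaned output dir under the data path; the paths here are illustrative, mirroring get_score() in test_ben.py):

```python
from calculate_score import Scoring

score = Scoring("datasets/ci/result.json")            # hypothetical data path
score.calculate_similarity_total("mineru", "datasets/ci")
print(score.summary_scores())
# -> {"average_edit_distance": ..., "average_bleu_score": ..., "average_sim_score": ...}
```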
# New Stars of Mathematics: Call for Solutions
Issue 15 (2016.06)
Hosted by Xiaosheng Mu

Problem 1. Let $z_{1}, z_{2}, z_{3}$ be unit complex numbers. Prove that there exists a unit complex number $z$ such that:
$$
\frac{1}{\left|z-z_{1}\right|^{2}}+\frac{1}{\left|z-z_{2}\right|^{2}}+\frac{1}{\left|z-z_{3}\right|^{2}} \leq \frac{9}{4}
$$
(Proposed by Wang Yixuan, Wugang No. 3 High School, Hubei, and Leng Gangsong, Shanghai University)
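A quick sharpness check, not part of the original statement: with $z_{1}, z_{2}, z_{3}$ the cube roots of unity ($1, \omega, \omega^{2}$) and $z = -1$, the identity $1+\omega+\omega^{2}=0$ gives $-1-\omega=\omega^{2}$ and $-1-\omega^{2}=\omega$, so
$$
\frac{1}{|-1-1|^{2}}+\frac{1}{|-1-\omega|^{2}}+\frac{1}{|-1-\omega^{2}|^{2}}=\frac{1}{4}+1+1=\frac{9}{4},
$$
showing the constant $9/4$ cannot be improved.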
Problem 2. As shown in the figure, $D$ is a point on side $BC$ of equilateral triangle $ABC$ with $BD > CD$. Let $O_{1}, I_{1}$ be the circumcenter and incenter of $\triangle ABD$, and $O_{2}, I_{2}$ the circumcenter and incenter of $\triangle ACD$. The external common tangent of circles $I_{1}$ and $I_{2}$ other than $BC$ meets $AB, AC$ at $P, Q$. Suppose lines $PI_{1}$ and $QI_{2}$ meet at $R$, and lines $O_{1}I_{1}$ and $O_{2}I_{2}$ meet at $T$. Prove that $AT^{2} = AR^{2} + AD \cdot BC$.
(Proposed by Lu Sheng, Qinzhou, Guangxi)

Problem 3. Given positive integers $m, n$, consider an $m \times n$ board of white cells on which some cells are first colored black. Afterwards, at any moment, if a white cell is adjacent to at least two black cells, it may also be colored black. At least how many cells must initially be colored black so that the whole board can be blackened at some moment?
(Proposed by Xiaosheng Mu, Harvard University)

Problem 4. $ABC$ is a triangle, and $P, Q, R$ are points on $BC, CA, AB$ respectively. Prove that the perimeter of $\triangle PQR$ is not less than the minimum of the perimeters of $\triangle AQR$, $\triangle BRP$, $\triangle CPQ$.
(Proposed by Xiaosheng Mu, Harvard University)
conda create -n MinerU python=3.10
conda activate MinerU
pip install magic-pdf
#cp magic-pdf.template.json ~/magic-pdf.json
\ No newline at end of file
{
    "bucket_info": {
        "bucket-name-1": ["ak", "sk", "endpoint"],
        "bucket-name-2": ["ak", "sk", "endpoint"]
    },
    "temp-output-dir": "/tmp",
    "models-dir": "/tmp/models",
    "device-mode": "cpu"
}
\ No newline at end of file
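A quick sanity check one can run locally before invoking magic-pdf. Whether the tool itself requires every field is an assumption here; this only verifies the keys present in the template above, at the `~/magic-pdf.json` location referenced in env.sh:

```python
import json
import os

with open(os.path.expanduser("~/magic-pdf.json"), encoding="utf-8") as f:
    cfg = json.load(f)
for key in ("bucket_info", "temp-output-dir", "models-dir", "device-mode"):
    assert key in cfg, f"missing config key: {key}"
```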
"""
clean data
"""
import argparse
import os
import re
import htmltabletomd # type: ignore
import pypandoc
import argparse
parser = argparse.ArgumentParser(description="get tool type")
parser.add_argument(
"--tool_name",
type=str,
required=True,
help="input tool name",
)
parser.add_argument(
"--download_dir",
type=str,
required=True,
help="input download dir",
)
args = parser.parse_args()


def clean_markdown_images(content):
    """
    clean markdown images
    """
    pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
    cleaned_content = pattern.sub('', content)
    return cleaned_content


def clean_ocrmath_photo(content):
    """
    clean ocrmath photo
    """
    pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
    cleaned_content = pattern.sub('', content)
    return cleaned_content


def convert_html_table_to_md(html_table):
    """
    convert a bare html table fragment (first line containing <tr>) to a
    markdown table; anything else is returned unchanged and left to
    convert_html_tables_to_md below
    """
    lines = html_table.strip().split('\n')
    # pass through content that is not a bare table
    if not lines or '<tr>' not in lines[0]:
        return html_table
    md_table = ''
    in_thead = True
    for line in lines:
        if '<th>' in line:
            cells = re.findall(r'<th>(.*?)</th>', line)
            md_table += '| ' + ' | '.join(cells) + ' |\n'
            in_thead = False
        elif '<td>' in line and not in_thead:
            cells = re.findall(r'<td>(.*?)</td>', line)
            md_table += '| ' + ' | '.join(cells) + ' |\n'
    return md_table.rstrip() + '\n'


def convert_latex_to_md(content):
    """
    convert latex tables to markdown tables
    """
    tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
    placeholders = []
    for table in tables:
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
        replace_str = f"\\begin{{tabular}}{table}\\end{{tabular}}"
        content = content.replace(replace_str, placeholder)
        try:
            pypandoc.convert_text(replace_str, format="latex", to="md",
                                  outputfile="output.md", encoding="utf-8")
        except Exception:
            # fall back to the raw latex if pandoc cannot convert it
            markdown_string = replace_str
        else:
            with open('output.md', 'r', encoding='utf-8') as fr:
                markdown_string = fr.read()
        placeholders.append((placeholder, markdown_string))
    new_content = content
    for placeholder, md_table in placeholders:
        new_content = new_content.replace(placeholder, md_table)
    return new_content


def convert_html_tables_to_md(content):
    """
    convert all html tables embedded in content to markdown tables
    """
    tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
    placeholders = []
    for table in tables:
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
        content = content.replace(f"<table>{table}</table>", placeholder)
        try:
            convert_table = htmltabletomd.convert_table(table)
        except Exception:
            # keep the raw html if conversion fails
            convert_table = table
        placeholders.append((placeholder, convert_table))
    new_content = content
    for placeholder, md_table in placeholders:
        new_content = new_content.replace(placeholder, md_table)
    return new_content


def clean_data(prod_type, download_dir):
    """
    clean every markdown file under download_dir/<tool> into download_dir/<tool>/cleaned
    """
    tgt_dir = os.path.join(download_dir, prod_type, "cleaned")
    if not os.path.exists(tgt_dir):
        os.makedirs(tgt_dir)
    source_dir = os.path.join(download_dir, prod_type)
    filenames = os.listdir(source_dir)
    for filename in filenames:
        if filename.endswith('.md'):
            input_file = os.path.join(source_dir, filename)
            output_file = os.path.join(tgt_dir, "cleaned_" + filename)
            with open(input_file, 'r', encoding='utf-8') as fr:
                content = fr.read()
            new_content = clean_markdown_images(content)
            new_content = convert_html_table_to_md(new_content)
            new_content = convert_latex_to_md(new_content)
            new_content = convert_html_tables_to_md(new_content)
            with open(output_file, 'w', encoding='utf-8') as fw:
                fw.write(new_content)


if __name__ == '__main__':
    tool_type = args.tool_name
    download_dir = args.download_dir
    clean_data(tool_type, download_dir)
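An inline illustration of the regex-based cleaning steps (the table converters shell out to pypandoc and htmltabletomd, so this sample sticks to the pure-regex functions; the sample string is hypothetical):

```python
sample = "intro ![fig](img/1.png) body \\includegraphics[width=5cm]{plot.pdf} end"
print(clean_ocrmath_photo(clean_markdown_images(sample)))
# -> "intro  body  end"  (image link and \includegraphics command removed)
```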
{"average_sim_score":0, "average_edit_distance":0, "average_bleu_score": 0}
\ No newline at end of file
from statistics import mean

from rapidfuzz import fuzz

CHUNK_MIN_CHARS = 25


def chunk_text(text, chunk_len=500):
    # split into fixed-size chunks, dropping blank or very short pieces
    chunks = [text[i:i + chunk_len] for i in range(0, len(text), chunk_len)]
    chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
    return chunks


def overlap_score(hypothesis_chunks, reference_chunks):
    # best fuzzy-match score for each hypothesis chunk, searched within a
    # window of reference chunks around its length-scaled position
    if len(reference_chunks) > 0:
        length_modifier = len(hypothesis_chunks) / len(reference_chunks)
    else:
        length_modifier = 0
    search_distance = max(len(reference_chunks) // 5, 10)
    chunk_scores = []
    for i, hyp_chunk in enumerate(hypothesis_chunks):
        max_score = 0
        i_offset = int(i * length_modifier)
        chunk_range = range(max(0, i_offset - search_distance),
                            min(len(reference_chunks), i_offset + search_distance))
        for j in chunk_range:
            ref_chunk = reference_chunks[j]
            score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
            if score > max_score:
                max_score = score
        chunk_scores.append(max_score)
    return chunk_scores


def score_text(hypothesis, reference):
    # Returns a 0-1 alignment score
    hypothesis_chunks = chunk_text(hypothesis)
    reference_chunks = chunk_text(reference)
    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
    if len(chunk_scores) > 0:
        return mean(chunk_scores)
    return 0
\ No newline at end of file
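A usage sketch for the chunk-alignment score; note that chunk_text() drops pieces shorter than CHUNK_MIN_CHARS, so inputs under about 25 characters score 0 by construction:

```python
ref = "The quick brown fox jumps over the lazy dog. " * 20
hyp = ref.replace("lazy", "sleepy")
print(score_text(hyp, ref))  # near 1.0 for almost-identical texts
```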
......@@ -2,6 +2,6 @@ import os
conf = {
"code_path": os.environ.get('GITHUB_WORKSPACE'),
"pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
"pdf_res_path": "/tmp"
"pdf_res_path": "/tmp/magic-pdf"
}
......@@ -6,35 +6,27 @@ from lib import common
import logging
import os
import json
from loguru import logger
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
pdf_res_path = conf.conf["pdf_res_path"]
code_path = conf.conf["code_path"]
pdf_dev_path = conf.conf["pdf_dev_path"]
class TestCli:
    def test_pdf_specify_dir(self):
        """
        test cli: input PDFs and model results from a specified directory
        """
        cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, pdf_dev_path)
        logging.info(cmd)
        common.check_shell(cmd)
        # common.count_folders_and_check_contents(pdf_res_path)

    def test_pdf_sdk(self):
        """
        parse via the pdf sdk
        """
        demo_names = list()
        pdf_path = os.path.join(pdf_dev_path, "pdf")
        for pdf_file in os.listdir(pdf_path):
            if pdf_file.endswith('.pdf'):
                demo_names.append(pdf_file.split('.')[0])
        for demo_name in demo_names:
            model_path = os.path.join(pdf_dev_path, f"{demo_name}_model.json")
            pdf_path = os.path.join(pdf_dev_path, f"{demo_name}.pdf")
            pdf_bytes = open(pdf_path, "rb").read()
            model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
......@@ -45,9 +37,11 @@ class TestCli:
        pipe.pipe_classify()
        pipe.pipe_parse()
        md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
        res_path = os.path.join(pdf_dev_path, "miner", f"{demo_name}.md")
        with open(res_path, "w", encoding="utf-8") as f:
            f.write(md_content)
    # def test_pdf_specify_jsonl(self):
    #     """
    #     input jsonl, parse with the default method
    #     """
......