Commit cf3e8519 authored by 赵小蒙

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	magic_pdf/libs/pdf_image_tools.py
parents 7b937d58 d867304f
......@@ -40,12 +40,15 @@ jobs:
pip install -r requirements.txt
fi
- name: benchmark
- name: config-net-reset
run: |
export http_proxy=""
export https_proxy=""
- name: get-benchmark-result
run: |
echo "start test"
cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json
cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
......
......@@ -50,7 +50,7 @@ def get_pdf_parse_method(method):
def prepare_env():
local_parent_dir = os.path.join(
get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d")
)
local_image_dir = os.path.join(local_parent_dir, "images")
......@@ -132,7 +132,7 @@ def pdf_command(pdf, model, method):
local_image_dir, _ = prepare_env()
local_image_rw = DiskReaderWriter(local_image_dir)
parse = get_pdf_parse_method(method)
parse(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
parse(pdf_data, jso, local_image_rw, is_debug=True)
if __name__ == "__main__":
......
......@@ -35,6 +35,9 @@ class DiskReaderWriter(AbsReaderWriter):
abspath = path
else:
abspath = os.path.join(self.path, path)
directory_path = os.path.dirname(abspath)
if not os.path.exists(directory_path):
os.makedirs(directory_path)
if mode == MODE_TXT:
with open(abspath, "w", encoding=self.encoding) as f:
f.write(content)
......@@ -53,11 +56,11 @@ class DiskReaderWriter(AbsReaderWriter):
# 使用示例
if __name__ == "__main__":
file_path = "io/example.txt"
file_path = "io/test/example.txt"
drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
# 写入内容到文件
drw.write(b"Hello, World!", path="io/example.txt", mode="binary")
drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
# 从文件读取内容
content = drw.read(path=file_path)
......
......@@ -17,7 +17,7 @@ def read_config():
config_file = os.path.join(home_dir, "magic-pdf.json")
if not os.path.exists(config_file):
raise Exception("magic-pdf.json not found")
raise Exception(f"{config_file} not found")
with open(config_file, "r") as f:
config = json.load(f)
......
......@@ -5,7 +5,7 @@ from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter):
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter:AbsReaderWriter):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
......
# 工具脚本使用说明
......@@ -756,7 +756,7 @@ def merge_json_data(json_test_df, json_standard_df):
return inner_merge, standard_exist, test_exist
def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
"""
将结果字典保存为JSON文件至指定路径。
......@@ -764,18 +764,19 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
- result_dict: 包含计算结果的字典。
- overall_path: 结果文件的保存路径,包括文件名。
"""
with open(overall_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
overall_path_res = "OCR抽取方案整体评测指标结果请查看:" + final_overall_path
print(f'\033[31m{overall_path_res}\033[0m')
# 打开指定的文件以写入
with open(badcase_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(result_dict, f, ensure_ascii=False, indent=4)
print(f"计算结果已经保存到文件:{badcase_path}")
with open(overall_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
print(f"计算结果已经保存到文件:{overall_path}")
final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
badcase_path_res = "OCR抽取方案评测badcase输出报告查看:" + final_badcase_path
print(f'\033[31m{badcase_path_res}\033[0m')
def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
"""
......@@ -792,8 +793,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
# 上传文件到S3
s3.upload_file(file_path, bucket_name, s3_object_key)
print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
return s3_path
#print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
except FileNotFoundError:
print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
except NoCredentialsError:
......@@ -801,6 +803,7 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
except ClientError as e:
print(f"上传文件时发生错误:{e}")
def generate_filename(badcase_path,overall_path):
"""
生成带有当前时间戳的输出文件名。
......@@ -864,17 +867,19 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
badcase_file,overall_file = generate_filename(badcase_path,overall_path)
# 保存结果到JSON文件
save_results(result_dict, overall_report_dict,badcase_file,overall_file)
#save_results(result_dict, overall_report_dict,badcase_file,overall_file)
save_results(result_dict, overall_report_dict,badcase_file,overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
result=compare_edit_distance(base_data_path, overall_report_dict)
"""
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try:
upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e:
print(f"上传到S3时发生错误: {e}")
print(result)
"""
#print(result)
assert result == 1
if __name__ == "__main__":
......
......@@ -768,7 +768,7 @@ def merge_json_data(json_test_df, json_standard_df):
return inner_merge, standard_exist, test_exist
def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
"""
将结果字典保存为JSON文件至指定路径。
......@@ -776,18 +776,21 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
- result_dict: 包含计算结果的字典。
- overall_path: 结果文件的保存路径,包括文件名。
"""
with open(overall_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
overall_path_res = "文本型PDF抽取方案整体评测指标结果请查看:" + final_overall_path
print(f'\033[31m{overall_path_res}\033[0m')
# 打开指定的文件以写入
with open(badcase_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(result_dict, f, ensure_ascii=False, indent=4)
final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
badcase_path_res = "文本型PDF抽取方案评测badcase输出报告查看:" + final_badcase_path
print(f'\033[31m{badcase_path_res}\033[0m')
print(f"计算结果已经保存到文件:{badcase_path}")
with open(overall_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
print(f"计算结果已经保存到文件:{overall_path}")
def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
......@@ -805,8 +808,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
# 上传文件到S3
s3.upload_file(file_path, bucket_name, s3_object_key)
print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
return s3_path
#print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
except FileNotFoundError:
print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
except NoCredentialsError:
......@@ -875,17 +879,17 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
badcase_file,overall_file = generate_filename(badcase_path,overall_path)
# 保存结果到JSON文件
save_results(result_dict, overall_report_dict,badcase_file,overall_file)
save_results(result_dict, overall_report_dict,badcase_file,overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
result=compare_edit_distance(base_data_path, overall_report_dict)
"""
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try:
upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e:
print(f"上传到S3时发生错误: {e}")
print(result)
"""
assert result == 1
if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment