Commit cf3e8519 authored by 赵小蒙

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	magic_pdf/libs/pdf_image_tools.py
parents 7b937d58 d867304f
...@@ -40,12 +40,15 @@ jobs: ...@@ -40,12 +40,15 @@ jobs:
pip install -r requirements.txt pip install -r requirements.txt
fi fi
- name: config-net-reset
- name: benchmark run: |
export http_proxy=""
export https_proxy=""
- name: get-benchmark-result
run: | run: |
echo "start test" echo "start test"
cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
notify_to_feishu: notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }} if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
......
...@@ -50,7 +50,7 @@ def get_pdf_parse_method(method): ...@@ -50,7 +50,7 @@ def get_pdf_parse_method(method):
def prepare_env(): def prepare_env():
local_parent_dir = os.path.join( local_parent_dir = os.path.join(
get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d-%H-%M-%S") get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d")
) )
local_image_dir = os.path.join(local_parent_dir, "images") local_image_dir = os.path.join(local_parent_dir, "images")
...@@ -132,7 +132,7 @@ def pdf_command(pdf, model, method): ...@@ -132,7 +132,7 @@ def pdf_command(pdf, model, method):
local_image_dir, _ = prepare_env() local_image_dir, _ = prepare_env()
local_image_rw = DiskReaderWriter(local_image_dir) local_image_rw = DiskReaderWriter(local_image_dir)
parse = get_pdf_parse_method(method) parse = get_pdf_parse_method(method)
parse(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True) parse(pdf_data, jso, local_image_rw, is_debug=True)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -35,6 +35,9 @@ class DiskReaderWriter(AbsReaderWriter): ...@@ -35,6 +35,9 @@ class DiskReaderWriter(AbsReaderWriter):
abspath = path abspath = path
else: else:
abspath = os.path.join(self.path, path) abspath = os.path.join(self.path, path)
directory_path = os.path.dirname(abspath)
if not os.path.exists(directory_path):
os.makedirs(directory_path)
if mode == MODE_TXT: if mode == MODE_TXT:
with open(abspath, "w", encoding=self.encoding) as f: with open(abspath, "w", encoding=self.encoding) as f:
f.write(content) f.write(content)
...@@ -53,11 +56,11 @@ class DiskReaderWriter(AbsReaderWriter): ...@@ -53,11 +56,11 @@ class DiskReaderWriter(AbsReaderWriter):
# 使用示例 # 使用示例
if __name__ == "__main__": if __name__ == "__main__":
file_path = "io/example.txt" file_path = "io/test/example.txt"
drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf") drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
# 写入内容到文件 # 写入内容到文件
drw.write(b"Hello, World!", path="io/example.txt", mode="binary") drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
# 从文件读取内容 # 从文件读取内容
content = drw.read(path=file_path) content = drw.read(path=file_path)
......
...@@ -17,7 +17,7 @@ def read_config(): ...@@ -17,7 +17,7 @@ def read_config():
config_file = os.path.join(home_dir, "magic-pdf.json") config_file = os.path.join(home_dir, "magic-pdf.json")
if not os.path.exists(config_file): if not os.path.exists(config_file):
raise Exception("magic-pdf.json not found") raise Exception(f"{config_file} not found")
with open(config_file, "r") as f: with open(config_file, "r") as f:
config = json.load(f) config = json.load(f)
......
...@@ -5,7 +5,7 @@ from magic_pdf.libs.commons import join_path ...@@ -5,7 +5,7 @@ from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256 from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter): def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter:AbsReaderWriter):
""" """
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。 save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
......
# 工具脚本使用说明
...@@ -756,7 +756,7 @@ def merge_json_data(json_test_df, json_standard_df): ...@@ -756,7 +756,7 @@ def merge_json_data(json_test_df, json_standard_df):
return inner_merge, standard_exist, test_exist return inner_merge, standard_exist, test_exist
def save_results(result_dict,overall_report_dict,badcase_path,overall_path,): def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
""" """
将结果字典保存为JSON文件至指定路径。 将结果字典保存为JSON文件至指定路径。
...@@ -764,18 +764,19 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,): ...@@ -764,18 +764,19 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
- result_dict: 包含计算结果的字典。 - result_dict: 包含计算结果的字典。
- overall_path: 结果文件的保存路径,包括文件名。 - overall_path: 结果文件的保存路径,包括文件名。
""" """
with open(overall_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
overall_path_res = "OCR抽取方案整体评测指标结果请查看:" + final_overall_path
print(f'\033[31m{overall_path_res}\033[0m')
# 打开指定的文件以写入 # 打开指定的文件以写入
with open(badcase_path, 'w', encoding='utf-8') as f: with open(badcase_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件 # 将结果字典转换为JSON格式并写入文件
json.dump(result_dict, f, ensure_ascii=False, indent=4) json.dump(result_dict, f, ensure_ascii=False, indent=4)
final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
print(f"计算结果已经保存到文件:{badcase_path}") badcase_path_res = "OCR抽取方案评测badcase输出报告查看:" + final_badcase_path
print(f'\033[31m{badcase_path_res}\033[0m')
with open(overall_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
print(f"计算结果已经保存到文件:{overall_path}")
def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL): def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
""" """
...@@ -792,8 +793,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE ...@@ -792,8 +793,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
# 上传文件到S3 # 上传文件到S3
s3.upload_file(file_path, bucket_name, s3_object_key) s3.upload_file(file_path, bucket_name, s3_object_key)
s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}") return s3_path
#print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
except FileNotFoundError: except FileNotFoundError:
print(f"文件 {file_path} 未找到,请检查文件路径是否正确。") print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
except NoCredentialsError: except NoCredentialsError:
...@@ -801,6 +803,7 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE ...@@ -801,6 +803,7 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
except ClientError as e: except ClientError as e:
print(f"上传文件时发生错误:{e}") print(f"上传文件时发生错误:{e}")
def generate_filename(badcase_path,overall_path): def generate_filename(badcase_path,overall_path):
""" """
生成带有当前时间戳的输出文件名。 生成带有当前时间戳的输出文件名。
...@@ -864,17 +867,19 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat ...@@ -864,17 +867,19 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
badcase_file,overall_file = generate_filename(badcase_path,overall_path) badcase_file,overall_file = generate_filename(badcase_path,overall_path)
# 保存结果到JSON文件 # 保存结果到JSON文件
save_results(result_dict, overall_report_dict,badcase_file,overall_file) #save_results(result_dict, overall_report_dict,badcase_file,overall_file)
save_results(result_dict, overall_report_dict,badcase_file,overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
result=compare_edit_distance(base_data_path, overall_report_dict) result=compare_edit_distance(base_data_path, overall_report_dict)
"""
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]): if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try: try:
upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url) upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url) upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e: except Exception as e:
print(f"上传到S3时发生错误: {e}") print(f"上传到S3时发生错误: {e}")
print(result) """
#print(result)
assert result == 1 assert result == 1
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -768,7 +768,7 @@ def merge_json_data(json_test_df, json_standard_df): ...@@ -768,7 +768,7 @@ def merge_json_data(json_test_df, json_standard_df):
return inner_merge, standard_exist, test_exist return inner_merge, standard_exist, test_exist
def save_results(result_dict,overall_report_dict,badcase_path,overall_path,): def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
""" """
将结果字典保存为JSON文件至指定路径。 将结果字典保存为JSON文件至指定路径。
...@@ -776,18 +776,21 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,): ...@@ -776,18 +776,21 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
- result_dict: 包含计算结果的字典。 - result_dict: 包含计算结果的字典。
- overall_path: 结果文件的保存路径,包括文件名。 - overall_path: 结果文件的保存路径,包括文件名。
""" """
with open(overall_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
overall_path_res = "文本型PDF抽取方案整体评测指标结果请查看:" + final_overall_path
print(f'\033[31m{overall_path_res}\033[0m')
# 打开指定的文件以写入 # 打开指定的文件以写入
with open(badcase_path, 'w', encoding='utf-8') as f: with open(badcase_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件 # 将结果字典转换为JSON格式并写入文件
json.dump(result_dict, f, ensure_ascii=False, indent=4) json.dump(result_dict, f, ensure_ascii=False, indent=4)
final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
badcase_path_res = "文本型PDF抽取方案评测badcase输出报告查看:" + final_badcase_path
print(f'\033[31m{badcase_path_res}\033[0m')
print(f"计算结果已经保存到文件:{badcase_path}")
with open(overall_path, 'w', encoding='utf-8') as f:
# 将结果字典转换为JSON格式并写入文件
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
print(f"计算结果已经保存到文件:{overall_path}")
def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL): def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
...@@ -805,8 +808,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE ...@@ -805,8 +808,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
# 上传文件到S3 # 上传文件到S3
s3.upload_file(file_path, bucket_name, s3_object_key) s3.upload_file(file_path, bucket_name, s3_object_key)
s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}") return s3_path
#print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
except FileNotFoundError: except FileNotFoundError:
print(f"文件 {file_path} 未找到,请检查文件路径是否正确。") print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
except NoCredentialsError: except NoCredentialsError:
...@@ -875,17 +879,17 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat ...@@ -875,17 +879,17 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
badcase_file,overall_file = generate_filename(badcase_path,overall_path) badcase_file,overall_file = generate_filename(badcase_path,overall_path)
# 保存结果到JSON文件 # 保存结果到JSON文件
save_results(result_dict, overall_report_dict,badcase_file,overall_file) save_results(result_dict, overall_report_dict,badcase_file,overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
result=compare_edit_distance(base_data_path, overall_report_dict) result=compare_edit_distance(base_data_path, overall_report_dict)
"""
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]): if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try: try:
upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url) upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url) upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e: except Exception as e:
print(f"上传到S3时发生错误: {e}") print(f"上传到S3时发生错误: {e}")
print(result) """
assert result == 1 assert result == 1
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment