CI yaml

194f34af · quyuan · a6b22ae5 · a6b22ae5 · a6b22ae5 · 194f34af
Commit 194f34af authored Apr 15, 2024 by quyuan
Showing with 35 additions and 201 deletions

base_data_ocr.json tools/base_data_ocr.json +0 -87

base_data_text.json tools/base_data_text.json +0 -88

ocr_badcase.py tools/ocr_badcase.py +19 -14

text_badcase.py tools/text_badcase.py +16 -12

No files found.
--- a/tools/base_data_ocr.json
+++ b/tools/base_data_ocr.json
-{
-    "accuracy": 1.0,
-    "precision": 1.0,
-    "recall": 1.0,
-    "f1_score": 1.0,
-    "pdf间的平均编辑距离": 133.10256410256412,
-    "pdf间的平均bleu": 0.28838311595434046,
-    "分段准确率": 0.07220216606498195,
-    "行内公式准确率": {
-        "accuracy": 0.004835727492533068,
-        "precision": 0.008790072388831437,
-        "recall": 0.010634970284641852,
-        "f1_score": 0.009624911535739562
-    },
-    "行内公式编辑距离": 1.6176470588235294,
-    "行内公式bleu": 0.17154724654721457,
-    "行间公式准确率": {
-        "accuracy": 0.08490566037735849,
-        "precision": 0.1836734693877551,
-        "recall": 0.13636363636363635,
-        "f1_score": 0.1565217391304348
-    },
-    "行间公式编辑距离": 113.22222222222223,
-    "行间公式bleu": 0.2531053359913409,
-    "丢弃文本准确率": {
-        "accuracy": 0.00035398230088495576,
-        "precision": 0.0006389776357827476,
-        "recall": 0.0007930214115781126,
-        "f1_score": 0.0007077140835102619
-    },
-    "丢弃文本标签准确率": {
-        "color_background_header_txt_block": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 41.0
-        },
-        "header": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 4.0
-        },
-        "footnote": {
-            "precision": 1.0,
-            "recall": 0.009708737864077669,
-            "f1-score": 0.019230769230769232,
-            "support": 103.0
-        },
-        "on-table": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 665.0
-        },
-        "rotate": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 63.0
-        },
-        "on-image": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 380.0
-        },
-        "micro avg": {
-            "precision": 1.0,
-            "recall": 0.0007961783439490446,
-            "f1-score": 0.0015910898965791568,
-            "support": 1256.0
-        }
-    },
-    "丢弃图片准确率": {
-        "accuracy": 0.0,
-        "precision": 0.0,
-        "recall": 0.0,
-        "f1_score": 0.0
-    },
-    "丢弃表格准确率": {
-        "accuracy": 0.0,
-        "precision": 0.0,
-        "recall": 0.0,
-        "f1_score": 0.0
-    }
-}
\ No newline at end of file
--- a/tools/base_data_text.json
+++ b/tools/base_data_text.json
-{
-    "accuracy": 1.0,
-    "precision": 1.0,
-    "recall": 1.0,
-    "f1_score": 1.0,
-    "pdf间的平均编辑距离": 19.82051282051282,
-    "pdf间的平均bleu": 0.9002485609584511,
-    "阅读顺序编辑距离": 0.3176895306859206,
-    "分段准确率": 0.8989169675090253,
-    "行内公式准确率": {
-        "accuracy": 0.9782741738066095,
-        "precision": 0.9782741738066095,
-        "recall": 1.0,
-        "f1_score": 0.9890177880897139
-    },
-    "行内公式编辑距离": 0.0,
-    "行内公式bleu": 0.20340450120213166,
-    "行间公式准确率": {
-        "accuracy": 1.0,
-        "precision": 1.0,
-        "recall": 1.0,
-        "f1_score": 1.0
-    },
-    "行间公式编辑距离": 0.0,
-    "行间公式bleu": 0.3662262622386575,
-    "丢弃文本准确率": {
-        "accuracy": 0.867870036101083,
-        "precision": 0.9064856711915535,
-        "recall": 0.9532117367168914,
-        "f1_score": 0.9292616930807885
-    },
-    "丢弃文本标签准确率": {
-        "color_background_header_txt_block": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 41.0
-        },
-        "rotate": {
-            "precision": 1.0,
-            "recall": 0.9682539682539683,
-            "f1-score": 0.9838709677419355,
-            "support": 63.0
-        },
-        "footnote": {
-            "precision": 1.0,
-            "recall": 0.883495145631068,
-            "f1-score": 0.9381443298969072,
-            "support": 103.0
-        },
-        "header": {
-            "precision": 1.0,
-            "recall": 1.0,
-            "f1-score": 1.0,
-            "support": 4.0
-        },
-        "on-image": {
-            "precision": 0.9947643979057592,
-            "recall": 1.0,
-            "f1-score": 0.9973753280839895,
-            "support": 380.0
-        },
-        "on-table": {
-            "precision": 1.0,
-            "recall": 0.9443609022556391,
-            "f1-score": 0.97138437741686,
-            "support": 665.0
-        },
-        "micro avg": {
-            "precision": 0.9982847341337907,
-            "recall": 0.9267515923566879,
-            "f1-score": 0.9611890999174236,
-            "support": 1256.0
-        }
-    },
-    "丢弃图片准确率": {
-        "accuracy": 0.8666666666666667,
-        "precision": 0.9285714285714286,
-        "recall": 0.9285714285714286,
-        "f1_score": 0.9285714285714286
-    },
-    "丢弃表格准确率": {
-        "accuracy": 0,
-        "precision": 0,
-        "recall": 0,
-        "f1_score": 0
-    }
-}
\ No newline at end of file
--- a/tools/ocr_badcase.py
+++ b/tools/ocr_badcase.py
@@ -756,7 +756,7 @@ def merge_json_data(json_test_df, json_standard_df):

    return inner_merge, standard_exist, test_exist

-def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
+def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
    """
    将结果字典保存为JSON文件至指定路径。

@@ -764,19 +764,20 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
    - result_dict: 包含计算结果的字典。
    - overall_path: 结果文件的保存路径，包括文件名。
    """
+    with open(overall_path, 'w', encoding='utf-8') as f:
+    # 将结果字典转换为JSON格式并写入文件
+        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
+    final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    overall_path_res = "OCR抽取方案整体评测指标结果请查看：" + final_overall_path
+    print(f'\033[31m{overall_path_res}\033[0m')
    # 打开指定的文件以写入
    with open(badcase_path, 'w', encoding='utf-8') as f:
        # 将结果字典转换为JSON格式并写入文件
        json.dump(result_dict, f, ensure_ascii=False, indent=4)
-    badcase_path_res = "OCR抽取方案评测badcase输出报告查看：" + badcase_path
+    final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    badcase_path_res = "OCR抽取方案评测badcase输出报告查看：" + final_badcase_path
    print(f'\033[31m{badcase_path_res}\033[0m')

-    with open(overall_path, 'w', encoding='utf-8') as f:
-    # 将结果字典转换为JSON格式并写入文件
-        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
-    overall_path_res = "OCR抽取方案整体评测指标结果请查看：" + overall_path
-    print(f'\033[31m{overall_path_res}\033[0m')
-
 def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
    """
    上传文件到Amazon S3
@@ -792,8 +793,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
        
        # 上传文件到S3
        s3.upload_file(file_path, bucket_name, s3_object_key)
-        
-        print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory}，文件名为 {file_name}")
+        s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
+        return s3_path
+        #print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory}，文件名为 {file_name}")
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到，请检查文件路径是否正确。")
    except NoCredentialsError:
@@ -801,6 +803,7 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
    except ClientError as e:
        print(f"上传文件时发生错误：{e}")

+
 def generate_filename(badcase_path,overall_path):
    """
    生成带有当前时间戳的输出文件名。
@@ -864,17 +867,19 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
    badcase_file,overall_file = generate_filename(badcase_path,overall_path)

    # 保存结果到JSON文件
-    save_results(result_dict, overall_report_dict,badcase_file,overall_file)
+    #save_results(result_dict, overall_report_dict,badcase_file,overall_file)
+    save_results(result_dict, overall_report_dict,badcase_file,overall_file,  s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)

    result=compare_edit_distance(base_data_path, overall_report_dict)
-
+    """
    if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
        try:
            upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
            upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
        except Exception as e:
            print(f"上传到S3时发生错误: {e}")    
-    print(result)
+    """
+    #print(result)
    assert result == 1

 if __name__ == "__main__":

--- a/tools/text_badcase.py
+++ b/tools/text_badcase.py
@@ -768,7 +768,7 @@ def merge_json_data(json_test_df, json_standard_df):

    return inner_merge, standard_exist, test_exist

-def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
+def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
    """
    将结果字典保存为JSON文件至指定路径。

@@ -776,18 +776,21 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
    - result_dict: 包含计算结果的字典。
    - overall_path: 结果文件的保存路径，包括文件名。
    """
+    with open(overall_path, 'w', encoding='utf-8') as f:
+    # 将结果字典转换为JSON格式并写入文件
+        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
+    final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    overall_path_res = "文本型PDF抽取方案整体评测指标结果请查看：" + final_overall_path
+    print(f'\033[31m{overall_path_res}\033[0m')
    # 打开指定的文件以写入
    with open(badcase_path, 'w', encoding='utf-8') as f:
        # 将结果字典转换为JSON格式并写入文件
        json.dump(result_dict, f, ensure_ascii=False, indent=4)
-    badcase_path_res = "文本型PDF抽取方案评测badcase输出报告查看：" + badcase_path
+    final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    badcase_path_res = "文本型PDF抽取方案评测badcase输出报告查看：" + final_badcase_path
    print(f'\033[31m{badcase_path_res}\033[0m')

-    with open(overall_path, 'w', encoding='utf-8') as f:
-    # 将结果字典转换为JSON格式并写入文件
-        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
-    overall_path_res = "文本型PDF抽取方案整体评测指标结果请查看：" + overall_path
-    print(f'\033[31m{overall_path_res}\033[0m')
+

    
 def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
@@ -805,8 +808,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
        
        # 上传文件到S3
        s3.upload_file(file_path, bucket_name, s3_object_key)
-        
-        print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory}，文件名为 {file_name}")
+        s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
+        return s3_path
+        #print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory}，文件名为 {file_name}")
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到，请检查文件路径是否正确。")
    except NoCredentialsError:
@@ -875,17 +879,17 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
    badcase_file,overall_file = generate_filename(badcase_path,overall_path)

    # 保存结果到JSON文件
-    save_results(result_dict, overall_report_dict,badcase_file,overall_file)
+    save_results(result_dict, overall_report_dict,badcase_file,overall_file,  s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)

    result=compare_edit_distance(base_data_path, overall_report_dict)
-
+    """
    if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
        try:
            upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
            upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
        except Exception as e:
            print(f"上传到S3时发生错误: {e}")
-    print(result)
+    """
    assert result == 1

 if __name__ == "__main__":