Commit b66dda38 authored by Shuimo

Improved script to read JSON files from compressed (ZIP) packages

parent 015e2bdd
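For orientation, this change swaps direct on-disk reads for streaming reads out of a ZIP archive. Below is a minimal, self-contained sketch of the standard-library pattern the new code builds on; the archive and member names (results.zip, standard.json) are placeholders, and the member is assumed to be newline-delimited JSON:

import json
import zipfile
from io import TextIOWrapper

# Placeholder names: a ZIP archive holding one JSON Lines member.
with zipfile.ZipFile('results.zip', 'r') as z:
    with z.open('standard.json') as raw:             # binary stream from the archive
        text = TextIOWrapper(raw, encoding='utf-8')  # decode bytes to str, line by line
        records = [json.loads(line) for line in text]
print(f'{len(records)} records loaded')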
@@ -9,7 +9,8 @@ from sklearn import metrics
 from datetime import datetime
 import boto3
 from botocore.exceptions import NoCredentialsError, ClientError
+from io import TextIOWrapper
+import zipfile
@@ -429,28 +430,46 @@ def handle_multi_deletion(test_page, test_page_tag, test_page_bbox, standard_pag
-def check_files_exist(standard_file, test_file):
+def check_json_files_in_zip_exist(zip_file_path, standard_json_path_in_zip, test_json_path_in_zip):
     """
-    Check whether the files exist
+    Check whether the specified JSON files exist in the ZIP archive
     """
-    if not os.path.isfile(standard_file) or not os.path.isfile(test_file):
-        raise FileNotFoundError("One or both of the required JSON files are missing.")
+    with zipfile.ZipFile(zip_file_path, 'r') as z:
+        # Get a list of all files in the ZIP archive
+        all_files_in_zip = z.namelist()
+        # Check that both the standard file and the test file are in the ZIP archive
+        if standard_json_path_in_zip not in all_files_in_zip or test_json_path_in_zip not in all_files_in_zip:
+            raise FileNotFoundError("One or both of the required JSON files are missing from the ZIP archive.")
 
-def read_json_files(standard_file, test_file):
+def read_json_files_from_streams(standard_file_stream, test_file_stream):
     """
-    Read the JSON file content
+    Read JSON file content from file streams
     """
-    with open(standard_file, 'r', encoding='utf-8') as sf:
-        pdf_json_standard = [json.loads(line) for line in sf]
-    with open(test_file, 'r', encoding='utf-8') as tf:
-        pdf_json_test = [json.loads(line) for line in tf]
+    pdf_json_standard = [json.loads(line) for line in standard_file_stream]
+    pdf_json_test = [json.loads(line) for line in test_file_stream]
     json_standard_origin = pd.DataFrame(pdf_json_standard)
-    json_test = pd.DataFrame(pdf_json_test)
-    return json_standard_origin, json_test
+    json_test_origin = pd.DataFrame(pdf_json_test)
+    return json_standard_origin, json_test_origin
+
+def read_json_files_from_zip(zip_file_path, standard_json_path_in_zip, test_json_path_in_zip):
+    """
+    Read the two JSON files from the ZIP archive and return them as DataFrames
+    """
+    with zipfile.ZipFile(zip_file_path, 'r') as z:
+        with z.open(standard_json_path_in_zip) as standard_file_stream, \
+             z.open(test_json_path_in_zip) as test_file_stream:
+            standard_file_text_stream = TextIOWrapper(standard_file_stream, encoding='utf-8')
+            test_file_text_stream = TextIOWrapper(test_file_stream, encoding='utf-8')
+            json_standard_origin, json_test_origin = read_json_files_from_streams(
+                standard_file_text_stream, test_file_text_stream
+            )
+    return json_standard_origin, json_test_origin
 
 def merge_json_data(json_test_df, json_standard_df):
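A quick way to exercise the two new helpers together; the archive path and member names below are placeholders, while the function names come from this diff. Checking z.namelist() up front, as check_json_files_in_zip_exist does, gives a clearer error than letting z.open() raise KeyError later:

# Placeholder inputs; in the script these arrive via the CLI.
zip_path = 'data.zip'
standard_member = 'standard.json'
test_member = 'test.json'

# Fails fast with FileNotFoundError if either member is absent.
check_json_files_in_zip_exist(zip_path, standard_member, test_member)

# Returns one pandas DataFrame per JSON Lines member.
json_standard_origin, json_test_origin = read_json_files_from_zip(
    zip_path, standard_member, test_member
)
print(json_standard_origin.shape, json_test_origin.shape)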
@@ -634,23 +653,24 @@ def generate_output_filename(base_path):
-def main(standard_file, test_file, base_output_path, s3_bucket_name=None, s3_file_name=None, AWS_ACCESS_KEY=None, AWS_SECRET_KEY=None, END_POINT_URL=None):
+def main(standard_file, test_file, zip_file, base_output_path, s3_bucket_name=None, s3_file_name=None, AWS_ACCESS_KEY=None, AWS_SECRET_KEY=None, END_POINT_URL=None):
     """
     Main function; runs the entire evaluation pipeline.
 
     Parameters:
     - standard_file: path to the standard file.
     - test_file: path to the test file.
+    - zip_file: path to the ZIP archive.
     - base_output_path: base path and filename prefix for the result files.
     - s3_bucket_name: S3 bucket name (optional).
     - s3_file_name: file name on S3 (optional).
     - AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL: AWS credentials and endpoint URL (optional).
     """
     # Check that the files exist
-    check_files_exist(standard_file, test_file)
+    check_json_files_in_zip_exist(zip_file, standard_file, test_file)
 
     # Read the JSON file content
-    json_standard_origin, json_test_origin = read_json_files(standard_file, test_file)
+    json_standard_origin, json_test_origin = read_json_files_from_zip(zip_file, standard_file, test_file)
 
     # Merge the JSON data
     inner_merge, standard_exist, test_exist = merge_json_data(json_test_origin, json_standard_origin)
@@ -668,6 +688,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Main function; runs the entire evaluation pipeline.")
     parser.add_argument('standard_file', type=str, help='Path to the standard file.')
     parser.add_argument('test_file', type=str, help='Path to the test file.')
+    parser.add_argument('zip_file', type=str, help='Path to the ZIP archive.')
     parser.add_argument('base_output_path', type=str, help='Base path and filename prefix for the result files.')
     parser.add_argument('--s3_bucket_name', type=str, help='S3 bucket name.', default=None)
     parser.add_argument('--s3_file_name', type=str, help='File name on S3.', default=None)
@@ -677,5 +698,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
-    main(args.standard_file, args.test_file, args.base_output_path, args.s3_bucket_name, args.s3_file_name, args.AWS_ACCESS_KEY, args.AWS_SECRET_KEY, args.END_POINT_URL)
+    main(args.standard_file, args.test_file, args.zip_file, args.base_output_path, args.s3_bucket_name, args.s3_file_name, args.AWS_ACCESS_KEY, args.AWS_SECRET_KEY, args.END_POINT_URL)
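With the new positional argument, a call site passes the two in-archive JSON paths followed by the ZIP path and the output prefix. A sketch of an equivalent programmatic invocation; the script name evaluate.py and all paths are placeholders:

# Equivalent shell command (placeholder names):
#   python evaluate.py standard.json test.json results.zip output/result
import subprocess

subprocess.run(
    ['python', 'evaluate.py',       # placeholder script name
     'standard.json',               # standard_file: member path inside the ZIP
     'test.json',                   # test_file: member path inside the ZIP
     'results.zip',                 # zip_file: the new positional argument
     'output/result'],              # base_output_path
    check=True,
)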