Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
b66dda38
Commit
b66dda38
authored
Apr 08, 2024
by
Shuimo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Improved script to read json in compressed packages
parent
015e2bdd
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
39 additions
and
68 deletions
+39
-68
json_files.zip
tools/json_files.zip
+0
-0
ocr_badcase.py
tools/ocr_badcase.py
+39
-18
pdf_json_label_0306.json
tools/pdf_json_label_0306.json
+0
-50
No files found.
tools/
ocr_dataset.json
→
tools/
json_files.zip
View file @
b66dda38
No preview for this file type
tools/ocr_badcase.py
View file @
b66dda38
...
...
@@ -9,7 +9,8 @@ from sklearn import metrics
from
datetime
import
datetime
import
boto3
from
botocore.exceptions
import
NoCredentialsError
,
ClientError
from
io
import
TextIOWrapper
import
zipfile
...
...
@@ -429,28 +430,46 @@ def handle_multi_deletion(test_page, test_page_tag, test_page_bbox, standard_pag
def check_json_files_in_zip_exist(zip_file_path, standard_json_path_in_zip, test_json_path_in_zip):
    """Verify that both required JSON files are present inside the ZIP archive.

    Parameters:
        zip_file_path: path (or file-like object) of the ZIP archive to inspect.
        standard_json_path_in_zip: archive-internal path of the standard JSON file.
        test_json_path_in_zip: archive-internal path of the test JSON file.

    Raises:
        FileNotFoundError: if either required entry is absent from the archive.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        # Collect every entry name once, then test both paths for membership.
        entries = set(archive.namelist())
        required = {standard_json_path_in_zip, test_json_path_in_zip}
        if not required <= entries:
            raise FileNotFoundError(
                "One or both of the required JSON files are missing from the ZIP archive.")
def read_json_files_from_streams(standard_file_stream, test_file_stream):
    """Read line-delimited JSON (JSONL) records from two text streams.

    Each stream must yield one JSON document per line.

    Parameters:
        standard_file_stream: text stream with the standard records.
        test_file_stream: text stream with the test records.

    Returns:
        tuple: (json_standard_origin, json_test_origin) — one pandas
        DataFrame per stream, rows in stream order.
    """
    def _load_jsonl(stream):
        # One JSON object per line of the stream.
        return [json.loads(record) for record in stream]

    json_standard_origin = pd.DataFrame(_load_jsonl(standard_file_stream))
    json_test_origin = pd.DataFrame(_load_jsonl(test_file_stream))
    return json_standard_origin, json_test_origin
def read_json_files_from_zip(zip_file_path, standard_json_path_in_zip, test_json_path_in_zip):
    """Load two JSONL members of a ZIP archive as pandas DataFrames.

    The binary member streams are wrapped as UTF-8 text before the
    line-by-line JSON parsing is delegated to
    ``read_json_files_from_streams``.

    Parameters:
        zip_file_path: path (or file-like object) of the ZIP archive.
        standard_json_path_in_zip: archive-internal path of the standard JSON file.
        test_json_path_in_zip: archive-internal path of the test JSON file.

    Returns:
        tuple: (json_standard_origin, json_test_origin) DataFrames.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        standard_raw = archive.open(standard_json_path_in_zip)
        test_raw = archive.open(test_json_path_in_zip)
        with standard_raw, test_raw:
            # Decode the raw byte streams as UTF-8 text for the JSON parser.
            standard_text = TextIOWrapper(standard_raw, encoding='utf-8')
            test_text = TextIOWrapper(test_raw, encoding='utf-8')
            json_standard_origin, json_test_origin = read_json_files_from_streams(
                standard_text, test_text)
    return json_standard_origin, json_test_origin
def
merge_json_data
(
json_test_df
,
json_standard_df
):
...
...
@@ -634,23 +653,24 @@ def generate_output_filename(base_path):
def
main
(
standard_file
,
test_file
,
base_output_path
,
s3_bucket_name
=
None
,
s3_file_name
=
None
,
AWS_ACCESS_KEY
=
None
,
AWS_SECRET_KEY
=
None
,
END_POINT_URL
=
None
):
def
main
(
standard_file
,
test_file
,
zip_file
,
base_output_path
,
s3_bucket_name
=
None
,
s3_file_name
=
None
,
AWS_ACCESS_KEY
=
None
,
AWS_SECRET_KEY
=
None
,
END_POINT_URL
=
None
):
"""
主函数,执行整个评估流程。
参数:
- standard_file: 标准文件的路径。
- test_file: 测试文件的路径。
- zip_file: 压缩包的路径。
- base_output_path: 结果文件的基础路径和文件名前缀。
- s3_bucket_name: S3桶名称(可选)。
- s3_file_name: S3上的文件名(可选)。
- AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL: AWS访问凭证和端点URL(可选)。
"""
# 检查文件是否存在
check_
files_exist
(
standard_file
,
test_file
)
check_
json_files_in_zip_exist
(
zip_file
,
standard_file
,
test_file
)
# 读取JSON文件内容
json_standard_origin
,
json_test_origin
=
read_json_files
(
standard_file
,
test_file
)
json_standard_origin
,
json_test_origin
=
read_json_files
_from_zip
(
zip_file
,
standard_file
,
test_file
)
# 合并JSON数据
inner_merge
,
standard_exist
,
test_exist
=
merge_json_data
(
json_test_origin
,
json_standard_origin
)
...
...
@@ -668,6 +688,7 @@ if __name__ == "__main__":
parser
=
argparse
.
ArgumentParser
(
description
=
"主函数,执行整个评估流程。"
)
parser
.
add_argument
(
'standard_file'
,
type
=
str
,
help
=
'标准文件的路径。'
)
parser
.
add_argument
(
'test_file'
,
type
=
str
,
help
=
'测试文件的路径。'
)
parser
.
add_argument
(
'zip_file'
,
type
=
str
,
help
=
'压缩包的路径。'
)
parser
.
add_argument
(
'base_output_path'
,
type
=
str
,
help
=
'结果文件的基础路径和文件名前缀。'
)
parser
.
add_argument
(
'--s3_bucket_name'
,
type
=
str
,
help
=
'S3桶名称。'
,
default
=
None
)
parser
.
add_argument
(
'--s3_file_name'
,
type
=
str
,
help
=
'S3上的文件名。'
,
default
=
None
)
...
...
@@ -677,5 +698,5 @@ if __name__ == "__main__":
args
=
parser
.
parse_args
()
main
(
args
.
standard_file
,
args
.
test_file
,
args
.
base_output_path
,
args
.
s3_bucket_name
,
args
.
s3_file_name
,
args
.
AWS_ACCESS_KEY
,
args
.
AWS_SECRET_KEY
,
args
.
END_POINT_URL
)
main
(
args
.
standard_file
,
args
.
test_file
,
args
.
zip_file
,
args
.
base_output_path
,
args
.
s3_bucket_name
,
args
.
s3_file_name
,
args
.
AWS_ACCESS_KEY
,
args
.
AWS_SECRET_KEY
,
args
.
END_POINT_URL
)
tools/pdf_json_label_0306.json
deleted
100644 → 0
View file @
015e2bdd
This source diff could not be displayed because it is too large. You can
view the blob
instead.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment