Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
194f34af
Commit
194f34af
authored
Apr 15, 2024
by
quyuan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
CI yaml
parent
a6b22ae5
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
35 additions
and
201 deletions
+35
-201
base_data_ocr.json
tools/base_data_ocr.json
+0
-87
base_data_text.json
tools/base_data_text.json
+0
-88
ocr_badcase.py
tools/ocr_badcase.py
+19
-14
text_badcase.py
tools/text_badcase.py
+16
-12
No files found.
tools/base_data_ocr.json
deleted
100644 → 0
View file @
a6b22ae5
{
"accuracy"
:
1.0
,
"precision"
:
1.0
,
"recall"
:
1.0
,
"f1_score"
:
1.0
,
"pdf间的平均编辑距离"
:
133.10256410256412
,
"pdf间的平均bleu"
:
0.28838311595434046
,
"分段准确率"
:
0.07220216606498195
,
"行内公式准确率"
:
{
"accuracy"
:
0.004835727492533068
,
"precision"
:
0.008790072388831437
,
"recall"
:
0.010634970284641852
,
"f1_score"
:
0.009624911535739562
},
"行内公式编辑距离"
:
1.6176470588235294
,
"行内公式bleu"
:
0.17154724654721457
,
"行间公式准确率"
:
{
"accuracy"
:
0.08490566037735849
,
"precision"
:
0.1836734693877551
,
"recall"
:
0.13636363636363635
,
"f1_score"
:
0.1565217391304348
},
"行间公式编辑距离"
:
113.22222222222223
,
"行间公式bleu"
:
0.2531053359913409
,
"丢弃文本准确率"
:
{
"accuracy"
:
0.00035398230088495576
,
"precision"
:
0.0006389776357827476
,
"recall"
:
0.0007930214115781126
,
"f1_score"
:
0.0007077140835102619
},
"丢弃文本标签准确率"
:
{
"color_background_header_txt_block"
:
{
"precision"
:
0.0
,
"recall"
:
0.0
,
"f1-score"
:
0.0
,
"support"
:
41.0
},
"header"
:
{
"precision"
:
0.0
,
"recall"
:
0.0
,
"f1-score"
:
0.0
,
"support"
:
4.0
},
"footnote"
:
{
"precision"
:
1.0
,
"recall"
:
0.009708737864077669
,
"f1-score"
:
0.019230769230769232
,
"support"
:
103.0
},
"on-table"
:
{
"precision"
:
0.0
,
"recall"
:
0.0
,
"f1-score"
:
0.0
,
"support"
:
665.0
},
"rotate"
:
{
"precision"
:
0.0
,
"recall"
:
0.0
,
"f1-score"
:
0.0
,
"support"
:
63.0
},
"on-image"
:
{
"precision"
:
0.0
,
"recall"
:
0.0
,
"f1-score"
:
0.0
,
"support"
:
380.0
},
"micro avg"
:
{
"precision"
:
1.0
,
"recall"
:
0.0007961783439490446
,
"f1-score"
:
0.0015910898965791568
,
"support"
:
1256.0
}
},
"丢弃图片准确率"
:
{
"accuracy"
:
0.0
,
"precision"
:
0.0
,
"recall"
:
0.0
,
"f1_score"
:
0.0
},
"丢弃表格准确率"
:
{
"accuracy"
:
0.0
,
"precision"
:
0.0
,
"recall"
:
0.0
,
"f1_score"
:
0.0
}
}
\ No newline at end of file
tools/base_data_text.json
deleted
100644 → 0
View file @
a6b22ae5
{
"accuracy"
:
1.0
,
"precision"
:
1.0
,
"recall"
:
1.0
,
"f1_score"
:
1.0
,
"pdf间的平均编辑距离"
:
19.82051282051282
,
"pdf间的平均bleu"
:
0.9002485609584511
,
"阅读顺序编辑距离"
:
0.3176895306859206
,
"分段准确率"
:
0.8989169675090253
,
"行内公式准确率"
:
{
"accuracy"
:
0.9782741738066095
,
"precision"
:
0.9782741738066095
,
"recall"
:
1.0
,
"f1_score"
:
0.9890177880897139
},
"行内公式编辑距离"
:
0.0
,
"行内公式bleu"
:
0.20340450120213166
,
"行间公式准确率"
:
{
"accuracy"
:
1.0
,
"precision"
:
1.0
,
"recall"
:
1.0
,
"f1_score"
:
1.0
},
"行间公式编辑距离"
:
0.0
,
"行间公式bleu"
:
0.3662262622386575
,
"丢弃文本准确率"
:
{
"accuracy"
:
0.867870036101083
,
"precision"
:
0.9064856711915535
,
"recall"
:
0.9532117367168914
,
"f1_score"
:
0.9292616930807885
},
"丢弃文本标签准确率"
:
{
"color_background_header_txt_block"
:
{
"precision"
:
0.0
,
"recall"
:
0.0
,
"f1-score"
:
0.0
,
"support"
:
41.0
},
"rotate"
:
{
"precision"
:
1.0
,
"recall"
:
0.9682539682539683
,
"f1-score"
:
0.9838709677419355
,
"support"
:
63.0
},
"footnote"
:
{
"precision"
:
1.0
,
"recall"
:
0.883495145631068
,
"f1-score"
:
0.9381443298969072
,
"support"
:
103.0
},
"header"
:
{
"precision"
:
1.0
,
"recall"
:
1.0
,
"f1-score"
:
1.0
,
"support"
:
4.0
},
"on-image"
:
{
"precision"
:
0.9947643979057592
,
"recall"
:
1.0
,
"f1-score"
:
0.9973753280839895
,
"support"
:
380.0
},
"on-table"
:
{
"precision"
:
1.0
,
"recall"
:
0.9443609022556391
,
"f1-score"
:
0.97138437741686
,
"support"
:
665.0
},
"micro avg"
:
{
"precision"
:
0.9982847341337907
,
"recall"
:
0.9267515923566879
,
"f1-score"
:
0.9611890999174236
,
"support"
:
1256.0
}
},
"丢弃图片准确率"
:
{
"accuracy"
:
0.8666666666666667
,
"precision"
:
0.9285714285714286
,
"recall"
:
0.9285714285714286
,
"f1_score"
:
0.9285714285714286
},
"丢弃表格准确率"
:
{
"accuracy"
:
0
,
"precision"
:
0
,
"recall"
:
0
,
"f1_score"
:
0
}
}
\ No newline at end of file
tools/ocr_badcase.py
View file @
194f34af
...
...
@@ -756,7 +756,7 @@ def merge_json_data(json_test_df, json_standard_df):
return
inner_merge
,
standard_exist
,
test_exist
def
save_results
(
result_dict
,
overall_report_dict
,
badcase_path
,
overall_path
,):
def
save_results
(
result_dict
,
overall_report_dict
,
badcase_path
,
overall_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
):
"""
将结果字典保存为JSON文件至指定路径。
...
...
@@ -764,19 +764,20 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
- result_dict: 包含计算结果的字典。
- overall_path: 结果文件的保存路径,包括文件名。
"""
with
open
(
overall_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
overall_report_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
final_overall_path
=
upload_to_s3
(
overall_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
overall_path_res
=
"OCR抽取方案整体评测指标结果请查看:"
+
final_overall_path
print
(
f
'
\033
[31m{overall_path_res}
\033
[0m'
)
# 打开指定的文件以写入
with
open
(
badcase_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
result_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
badcase_path_res
=
"OCR抽取方案评测badcase输出报告查看:"
+
badcase_path
final_badcase_path
=
upload_to_s3
(
badcase_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
badcase_path_res
=
"OCR抽取方案评测badcase输出报告查看:"
+
final_badcase_path
print
(
f
'
\033
[31m{badcase_path_res}
\033
[0m'
)
with
open
(
overall_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
overall_report_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
overall_path_res
=
"OCR抽取方案整体评测指标结果请查看:"
+
overall_path
print
(
f
'
\033
[31m{overall_path_res}
\033
[0m'
)
def
upload_to_s3
(
file_path
,
bucket_name
,
s3_directory
,
AWS_ACCESS_KEY
,
AWS_SECRET_KEY
,
END_POINT_URL
):
"""
上传文件到Amazon S3
...
...
@@ -792,8 +793,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
# 上传文件到S3
s3
.
upload_file
(
file_path
,
bucket_name
,
s3_object_key
)
print
(
f
"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}"
)
s3_path
=
f
"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
return
s3_path
#print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
except
FileNotFoundError
:
print
(
f
"文件 {file_path} 未找到,请检查文件路径是否正确。"
)
except
NoCredentialsError
:
...
...
@@ -801,6 +803,7 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
except
ClientError
as
e
:
print
(
f
"上传文件时发生错误:{e}"
)
def
generate_filename
(
badcase_path
,
overall_path
):
"""
生成带有当前时间戳的输出文件名。
...
...
@@ -864,17 +867,19 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
badcase_file
,
overall_file
=
generate_filename
(
badcase_path
,
overall_path
)
# 保存结果到JSON文件
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
)
#save_results(result_dict, overall_report_dict,badcase_file,overall_file)
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
result
=
compare_edit_distance
(
base_data_path
,
overall_report_dict
)
"""
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try:
upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e:
print(f"上传到S3时发生错误: {e}")
print
(
result
)
"""
#print(result)
assert
result
==
1
if
__name__
==
"__main__"
:
...
...
tools/text_badcase.py
View file @
194f34af
...
...
@@ -768,7 +768,7 @@ def merge_json_data(json_test_df, json_standard_df):
return
inner_merge
,
standard_exist
,
test_exist
def
save_results
(
result_dict
,
overall_report_dict
,
badcase_path
,
overall_path
,):
def
save_results
(
result_dict
,
overall_report_dict
,
badcase_path
,
overall_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
):
"""
将结果字典保存为JSON文件至指定路径。
...
...
@@ -776,18 +776,21 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
- result_dict: 包含计算结果的字典。
- overall_path: 结果文件的保存路径,包括文件名。
"""
with
open
(
overall_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
overall_report_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
final_overall_path
=
upload_to_s3
(
overall_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
overall_path_res
=
"文本型PDF抽取方案整体评测指标结果请查看:"
+
final_overall_path
print
(
f
'
\033
[31m{overall_path_res}
\033
[0m'
)
# 打开指定的文件以写入
with
open
(
badcase_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
result_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
badcase_path_res
=
"文本型PDF抽取方案评测badcase输出报告查看:"
+
badcase_path
final_badcase_path
=
upload_to_s3
(
badcase_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
badcase_path_res
=
"文本型PDF抽取方案评测badcase输出报告查看:"
+
final_badcase_path
print
(
f
'
\033
[31m{badcase_path_res}
\033
[0m'
)
with
open
(
overall_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
overall_report_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
overall_path_res
=
"文本型PDF抽取方案整体评测指标结果请查看:"
+
overall_path
print
(
f
'
\033
[31m{overall_path_res}
\033
[0m'
)
def
upload_to_s3
(
file_path
,
bucket_name
,
s3_directory
,
AWS_ACCESS_KEY
,
AWS_SECRET_KEY
,
END_POINT_URL
):
...
...
@@ -805,8 +808,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
# 上传文件到S3
s3
.
upload_file
(
file_path
,
bucket_name
,
s3_object_key
)
print
(
f
"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}"
)
s3_path
=
f
"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
return
s3_path
#print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
except
FileNotFoundError
:
print
(
f
"文件 {file_path} 未找到,请检查文件路径是否正确。"
)
except
NoCredentialsError
:
...
...
@@ -875,17 +879,17 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
badcase_file
,
overall_file
=
generate_filename
(
badcase_path
,
overall_path
)
# 保存结果到JSON文件
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
)
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
result
=
compare_edit_distance
(
base_data_path
,
overall_report_dict
)
"""
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try:
upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e:
print(f"上传到S3时发生错误: {e}")
print
(
result
)
"""
assert
result
==
1
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment