Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
cf3e8519
Commit
cf3e8519
authored
Apr 15, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/master'
# Conflicts: # magic_pdf/libs/pdf_image_tools.py
parents
7b937d58
d867304f
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
54 additions
and
37 deletions
+54
-37
benchmark.yml
.github/workflows/benchmark.yml
+7
-4
magicpdf.py
magic_pdf/cli/magicpdf.py
+2
-2
DiskReaderWriter.py
magic_pdf/io/DiskReaderWriter.py
+5
-2
config_reader.py
magic_pdf/libs/config_reader.py
+1
-1
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+1
-1
README.MD
tools/README.MD
+2
-0
ocr_badcase.py
tools/ocr_badcase.py
+20
-15
text_badcase.py
tools/text_badcase.py
+16
-12
No files found.
.github/workflows/benchmark.yml
View file @
cf3e8519
...
...
@@ -40,12 +40,15 @@ jobs:
pip install -r requirements.txt
fi
-
name
:
benchmark
-
name
:
config-net-reset
run
:
|
export http_proxy=""
export https_proxy=""
-
name
:
get-benchmark-result
run
:
|
echo "start test"
cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json
cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json
--s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json
--s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
notify_to_feishu
:
if
:
${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
...
...
magic_pdf/cli/magicpdf.py
View file @
cf3e8519
...
...
@@ -50,7 +50,7 @@ def get_pdf_parse_method(method):
def
prepare_env
():
local_parent_dir
=
os
.
path
.
join
(
get_local_dir
(),
"magic-pdf"
,
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
-
%
H-
%
M-
%
S
"
)
get_local_dir
(),
"magic-pdf"
,
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d"
)
)
local_image_dir
=
os
.
path
.
join
(
local_parent_dir
,
"images"
)
...
...
@@ -132,7 +132,7 @@ def pdf_command(pdf, model, method):
local_image_dir
,
_
=
prepare_env
()
local_image_rw
=
DiskReaderWriter
(
local_image_dir
)
parse
=
get_pdf_parse_method
(
method
)
parse
(
pdf_data
,
jso
[
"doc_layout_result"
]
,
local_image_rw
,
is_debug
=
True
)
parse
(
pdf_data
,
jso
,
local_image_rw
,
is_debug
=
True
)
if
__name__
==
"__main__"
:
...
...
magic_pdf/io/DiskReaderWriter.py
View file @
cf3e8519
...
...
@@ -35,6 +35,9 @@ class DiskReaderWriter(AbsReaderWriter):
abspath
=
path
else
:
abspath
=
os
.
path
.
join
(
self
.
path
,
path
)
directory_path
=
os
.
path
.
dirname
(
abspath
)
if
not
os
.
path
.
exists
(
directory_path
):
os
.
makedirs
(
directory_path
)
if
mode
==
MODE_TXT
:
with
open
(
abspath
,
"w"
,
encoding
=
self
.
encoding
)
as
f
:
f
.
write
(
content
)
...
...
@@ -53,11 +56,11 @@ class DiskReaderWriter(AbsReaderWriter):
# 使用示例
if
__name__
==
"__main__"
:
file_path
=
"io/example.txt"
file_path
=
"io/
test/
example.txt"
drw
=
DiskReaderWriter
(
"D:
\
projects
\
papayfork
\
Magic-PDF
\
magic_pdf"
)
# 写入内容到文件
drw
.
write
(
b
"Hello, World!"
,
path
=
"io/example.txt"
,
mode
=
"binary"
)
drw
.
write
(
b
"Hello, World!"
,
path
=
"io/
test/
example.txt"
,
mode
=
"binary"
)
# 从文件读取内容
content
=
drw
.
read
(
path
=
file_path
)
...
...
magic_pdf/libs/config_reader.py
View file @
cf3e8519
...
...
@@ -17,7 +17,7 @@ def read_config():
config_file
=
os
.
path
.
join
(
home_dir
,
"magic-pdf.json"
)
if
not
os
.
path
.
exists
(
config_file
):
raise
Exception
(
"magic-pdf.json
not found"
)
raise
Exception
(
f
"{config_file}
not found"
)
with
open
(
config_file
,
"r"
)
as
f
:
config
=
json
.
load
(
f
)
...
...
magic_pdf/libs/pdf_image_tools.py
View file @
cf3e8519
...
...
@@ -5,7 +5,7 @@ from magic_pdf.libs.commons import join_path
from
magic_pdf.libs.hash_utils
import
compute_sha256
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
):
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
:
AbsReaderWriter
):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
...
...
tools/README.MD
0 → 100644
View file @
cf3e8519
# 工具脚本使用说明
tools/ocr_badcase.py
View file @
cf3e8519
...
...
@@ -756,7 +756,7 @@ def merge_json_data(json_test_df, json_standard_df):
return
inner_merge
,
standard_exist
,
test_exist
def
save_results
(
result_dict
,
overall_report_dict
,
badcase_path
,
overall_path
,):
def
save_results
(
result_dict
,
overall_report_dict
,
badcase_path
,
overall_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
):
"""
将结果字典保存为JSON文件至指定路径。
...
...
@@ -764,18 +764,19 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
- result_dict: 包含计算结果的字典。
- overall_path: 结果文件的保存路径,包括文件名。
"""
with
open
(
overall_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
overall_report_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
final_overall_path
=
upload_to_s3
(
overall_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
overall_path_res
=
"OCR抽取方案整体评测指标结果请查看:"
+
final_overall_path
print
(
f
'
\033
[31m{overall_path_res}
\033
[0m'
)
# 打开指定的文件以写入
with
open
(
badcase_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
result_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
print
(
f
"计算结果已经保存到文件:{badcase_path}"
)
with
open
(
overall_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
overall_report_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
print
(
f
"计算结果已经保存到文件:{overall_path}"
)
final_badcase_path
=
upload_to_s3
(
badcase_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
badcase_path_res
=
"OCR抽取方案评测badcase输出报告查看:"
+
final_badcase_path
print
(
f
'
\033
[31m{badcase_path_res}
\033
[0m'
)
def
upload_to_s3
(
file_path
,
bucket_name
,
s3_directory
,
AWS_ACCESS_KEY
,
AWS_SECRET_KEY
,
END_POINT_URL
):
"""
...
...
@@ -792,8 +793,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
# 上传文件到S3
s3
.
upload_file
(
file_path
,
bucket_name
,
s3_object_key
)
print
(
f
"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}"
)
s3_path
=
f
"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
return
s3_path
#print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
except
FileNotFoundError
:
print
(
f
"文件 {file_path} 未找到,请检查文件路径是否正确。"
)
except
NoCredentialsError
:
...
...
@@ -801,6 +803,7 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
except
ClientError
as
e
:
print
(
f
"上传文件时发生错误:{e}"
)
def
generate_filename
(
badcase_path
,
overall_path
):
"""
生成带有当前时间戳的输出文件名。
...
...
@@ -864,17 +867,19 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
badcase_file
,
overall_file
=
generate_filename
(
badcase_path
,
overall_path
)
# 保存结果到JSON文件
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
)
#save_results(result_dict, overall_report_dict,badcase_file,overall_file)
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
result
=
compare_edit_distance
(
base_data_path
,
overall_report_dict
)
"""
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try:
upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e:
print
(
f
"上传到S3时发生错误: {e}"
)
print
(
result
)
print(f"上传到S3时发生错误: {e}")
"""
#print(result)
assert
result
==
1
if
__name__
==
"__main__"
:
...
...
tools/text_badcase.py
View file @
cf3e8519
...
...
@@ -768,7 +768,7 @@ def merge_json_data(json_test_df, json_standard_df):
return
inner_merge
,
standard_exist
,
test_exist
def
save_results
(
result_dict
,
overall_report_dict
,
badcase_path
,
overall_path
,):
def
save_results
(
result_dict
,
overall_report_dict
,
badcase_path
,
overall_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
):
"""
将结果字典保存为JSON文件至指定路径。
...
...
@@ -776,18 +776,21 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
- result_dict: 包含计算结果的字典。
- overall_path: 结果文件的保存路径,包括文件名。
"""
with
open
(
overall_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
overall_report_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
final_overall_path
=
upload_to_s3
(
overall_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
overall_path_res
=
"文本型PDF抽取方案整体评测指标结果请查看:"
+
final_overall_path
print
(
f
'
\033
[31m{overall_path_res}
\033
[0m'
)
# 打开指定的文件以写入
with
open
(
badcase_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
result_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
final_badcase_path
=
upload_to_s3
(
badcase_path
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
badcase_path_res
=
"文本型PDF抽取方案评测badcase输出报告查看:"
+
final_badcase_path
print
(
f
'
\033
[31m{badcase_path_res}
\033
[0m'
)
print
(
f
"计算结果已经保存到文件:{badcase_path}"
)
with
open
(
overall_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
# 将结果字典转换为JSON格式并写入文件
json
.
dump
(
overall_report_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
print
(
f
"计算结果已经保存到文件:{overall_path}"
)
def
upload_to_s3
(
file_path
,
bucket_name
,
s3_directory
,
AWS_ACCESS_KEY
,
AWS_SECRET_KEY
,
END_POINT_URL
):
...
...
@@ -805,8 +808,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
# 上传文件到S3
s3
.
upload_file
(
file_path
,
bucket_name
,
s3_object_key
)
print
(
f
"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}"
)
s3_path
=
f
"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
return
s3_path
#print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
except
FileNotFoundError
:
print
(
f
"文件 {file_path} 未找到,请检查文件路径是否正确。"
)
except
NoCredentialsError
:
...
...
@@ -875,17 +879,17 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
badcase_file
,
overall_file
=
generate_filename
(
badcase_path
,
overall_path
)
# 保存结果到JSON文件
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
)
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
result
=
compare_edit_distance
(
base_data_path
,
overall_report_dict
)
"""
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try:
upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e:
print(f"上传到S3时发生错误: {e}")
print
(
result
)
"""
assert
result
==
1
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment