Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
229dc3c7
Commit
229dc3c7
authored
Jun 26, 2024
by
quyuan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add tool link
parent
d2e250ce
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
11 deletions
+12
-11
benchmark.yml
.github/workflows/benchmark.yml
+1
-2
benchmark.py
tools/benchmark.py
+11
-9
No files found.
.github/workflows/benchmark.yml
View file @
229dc3c7
...
...
@@ -47,8 +47,7 @@ jobs:
-
name
:
get-benchmark-result
run
:
|
echo "start test"
cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_overall base_data_text.json --badcase_path text_badcase --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_overall base_data_ocr.json --badcase_path ocr_badcase --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
cd tools && python benchmark.py
notify_to_feishu
:
if
:
${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
...
...
tools/benchmark.py
View file @
229dc3c7
...
...
@@ -7,22 +7,24 @@ pdf_res_path = "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/ci/magic-pdf"
def
test_cli
():
cmd
=
'cd
%
s && export PYTHONPATH=. && find
%
s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}'
%
(
code_path
,
pdf_dev_path
)
os
.
system
(
cmd
)
for
annotaion_name
in
os
.
listdir
(
os
.
join
(
pdf_dev_path
,
"output"
)):
if
not
os
.
path
.
exists
(
os
.
path
.
join
(
pdf_dev_path
,
"output"
)):
os
.
makedirs
(
os
.
path
.
join
(
pdf_dev_path
,
"output"
))
for
annotaion_name
in
os
.
listdir
(
os
.
path
.
join
(
pdf_dev_path
,
"output"
)):
if
annotaion_name
.
endswith
(
'.pdf'
):
for
pdf_res_path
in
os
.
listdir
(
pdf_res_path
):
if
".md"
in
os
.
join
(
pdf_res_path
,
annotaion_name
,
"auto"
):
if
".md"
in
os
.
path
.
join
(
pdf_res_path
,
annotaion_name
,
"auto"
):
prefix
=
annotaion_name
.
split
(
'_'
)[
-
2
]
if
not
os
.
path
.
exists
(
os
.
join
(
pdf_dev_path
,
prefix
)):
os
.
makedirs
(
os
.
join
(
pdf_dev_path
,
prefix
))
shutil
.
copy
(
os
.
join
(
pdf_res_path
,
annotaion_name
,
"auto"
,
annotaion_name
+
".md"
),
os
.
join
(
pdf_dev_path
,
prefix
,
annotaion_name
+
".md"
))
os
.
makedirs
(
os
.
path
.
join
(
pdf_dev_path
,
prefix
))
shutil
.
copy
(
os
.
path
.
join
(
pdf_res_path
,
annotaion_name
,
"auto"
,
annotaion_name
+
".md"
),
os
.
join
(
pdf_dev_path
,
prefix
,
annotaion_name
+
".md"
))
def
calculate_score
():
cmd
=
"cd
%
s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir
%
s"
%
pdf_dev_path
cmd
=
"cd
%
s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir
%
s"
%
(
code_path
,
pdf_dev_path
)
os
.
system
(
cmd
)
cmd
=
"cd
%
s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir
%
s"
%
(
pdf_dev_path
)
cmd
=
"cd
%
s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir
%
s"
%
(
code_path
,
pdf_dev_path
)
os
.
system
(
cmd
)
cmd
=
"cd
%
s && export PYTHONPATH=. && python tools/markdown_calculate.py --tool_name pdf-command --download_dir
%
s --results
%
s"
%
(
pdf_dev_path
,
os
.
join
(
pdf_dev_path
,
"result.json"
))
cmd
=
"cd
%
s && export PYTHONPATH=. && python tools/markdown_calculate.py --tool_name pdf-command --download_dir
%
s --results
%
s"
%
(
code_path
,
pdf_dev_path
,
os
.
path
.
join
(
pdf_dev_path
,
"result.json"
))
os
.
system
(
cmd
)
...
...
@@ -36,6 +38,6 @@ def extrat_zip(zip_file_path, extract_to_path):
if
__name__
==
"__main__"
:
extrat_zip
(
os
.
join
(
pdf_dev_path
,
'output.zip'
),
os
.
join
(
pdf_dev_path
,
'datasets'
))
extrat_zip
(
os
.
path
.
join
(
pdf_dev_path
,
'output.zip'
),
os
.
path
.
join
(
pdf_dev_path
,
'datasets'
))
test_cli
()
calculate_score
()
\ No newline at end of file
calculate_score
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment