Qin Kaijie / pdf-miner / Commits

Commit 34ed90b7 authored Jun 28, 2024 by 赵小蒙
Merge remote-tracking branch 'origin/master'

Parents: f84eb897, d3d7a093
Showing 5 changed files with 36 additions and 72 deletions (+36, -72).
.github/workflows/benchmark.yml   +0   -13
README.md                         +1   -1
README_zh-CN.md                   +1   -1
tools/benchmark.py                +25  -6
tools/markdown_calculate.py       +9   -51
.github/workflows/benchmark.yml

@@ -49,16 +49,3 @@ jobs:
           echo "start test"
           cd tools && python benchmark.py
-  notify_to_feishu:
-    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
-    needs: [pdf-test]
-    runs-on: pdf
-    steps:
-      - name: notify
-        run: |
-          curl ${{ secrets.WEBHOOK_URL }} -H 'Content-Type: application/json' -d '{
-            "msgtype": "text",
-            "text": {
-              "content": "'${{ github.repository }}' GitHubAction Failed!\n 细节请查看:https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"
-            }
-          }'
README.md

@@ -48,7 +48,7 @@ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3
 ### Submodule Repositories
 
-- [pdf-extract-kit](https://github.com/wangbinDL/pdf-extract-kit)
+- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
 - [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
 
 ## Getting Started
README_zh-CN.md

@@ -57,7 +57,7 @@ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3
 ### 子模块仓库
 
-- [pdf-extract-kit](https://github.com/wangbinDL/pdf-extract-kit)
+- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) 领先的文档分析模型
 - [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark) 端到端的PDF文档理解评估套件,专为大规模模型数据场景而设计
tools/benchmark.py

 import zipfile
 import os
 import shutil
 import json
+import markdown_calculate
 
 code_path = os.environ.get('GITHUB_WORKSPACE')
 #code_path = "/home/quyuan/actions-runner/_work/Magic-PDF/Magic-PDF.bk"
 #评测集存放路径

@@ -34,8 +36,10 @@ def calculate_score():
     os.system(cmd)
     cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
     os.system(cmd)
-    cmd = "cd %s && export PYTHONPATH=. && python tools/markdown_calculate.py --tool_name magicpdf --download_dir %s --results %s" % (code_path, data_path, os.path.join(data_path, "result.json"))
-    os.system(cmd)
+    score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
+    score.calculate_similarity_total("magicpdf", file_types, data_path)
+    res = score.summary_scores()
+    return res
 
 
 def extrat_zip(zip_file_path, extract_to_path):

@@ -49,9 +53,24 @@ def extrat_zip(zip_file_path, extract_to_path):
 def ci_ben():
     fr = open(os.path.join(pdf_dev_path, "ci", "result.json"), "r")
-
-if __name__ == "__main__":
+    lines = fr.readlines()
+    last_line = lines[-1].strip()
+    last_score = json.loads(last_line)
+    print("last_score:", last_score)
+    last_simscore = last_score["average_sim_score"]
+    last_editdistance = last_score["average_edit_distance"]
+    last_bleu = last_score["average_bleu_score"]
     extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
     test_cli()
-    calculate_score()
+    now_score = calculate_score()
+    print("now_score:", now_score)
+    now_simscore = now_score["average_sim_score"]
+    now_editdistance = now_score["average_edit_distance"]
+    now_bleu = now_score["average_bleu_score"]
+    assert last_simscore <= now_simscore
+    assert last_editdistance <= now_editdistance
+    assert last_bleu <= now_bleu
 
 
 if __name__ == "__main__":
     ci_ben()
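The reworked ci_ben() is effectively a regression gate: it reads the last recorded scores from ci/result.json, recomputes the current averages through calculate_score(), and asserts that none of the three metrics has dropped. A minimal standalone sketch of that check, mirroring the asserts above; the baseline path and example numbers here are hypothetical, only the three key names come from the diff:

import json

def regression_check(baseline_path, now_score):
    # The last line of the baseline file is expected to be a JSON object
    # holding the previously recorded averages.
    with open(baseline_path, "r", encoding="utf-8") as fr:
        last_score = json.loads(fr.readlines()[-1].strip())
    # Each averaged metric must not fall below the previous run.
    for key in ("average_sim_score", "average_edit_distance", "average_bleu_score"):
        assert last_score[key] <= now_score[key], f"{key} regressed"

# Example usage with made-up values:
# regression_check("ci/result.json",
#                  {"average_sim_score": 0.9, "average_edit_distance": 0.8, "average_bleu_score": 0.7})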
tools/markdown_calculate.py

@@ -7,44 +7,16 @@ import re
 import scoring
 import argparse
 
-parser = argparse.ArgumentParser(description="get directory")
-parser.add_argument('--document_types', nargs='+',
-                    choices=["academic_literature", "atlas", "courseware", "colorful_textbook",
-                             "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper",
-                             "ordinary_textbook", "research_report", "special_exam_paper"],
-                    help='Choose one or more document_types',
-                    default=["academic_literature", "atlas", "courseware", "colorful_textbook",
-                             "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper",
-                             "ordinary_textbook", "research_report", "special_exam_paper"])
-parser.add_argument("--tool_name", type=str, required=True, help="tool name")
-parser.add_argument("--download_dir", type=str, required=True, help="input download dir")
-parser.add_argument("--results", type=str, required=True, help="results path(end with .json)")
-args = parser.parse_args()
-fw = open(args.results, 'w+', encoding='utf-8')
 
 # 初始化列表来存储编辑距离和BLEU分数
 class Scoring:
-    def __init__(self):
+    def __init__(self, result_path):
         self.edit_distances = []
         self.bleu_scores = []
         self.sim_scores = []
         self.filenames = []
         self.score_dict = {}
         self.anntion_cnt = 0
+        self.fw = open(result_path, "w+")
 
     def simple_bleu_score(self, candidate, reference):
         candidate_tokens = word_tokenize(candidate)
         reference_tokens = word_tokenize(reference)

@@ -93,12 +65,12 @@ class Scoring:
         class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
         class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
         class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
-        fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
+        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
         ratio = len(class_dict) / total_file
-        fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
-        fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
-        fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
-        fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
+        self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
+        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
+        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
+        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
         print(f"{tool_type} extract ratio: {ratio}")
         print(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")

@@ -115,8 +87,8 @@ class Scoring:
         over_all_dict["average_edit_distance"] = average_edit_distance
         over_all_dict["average_bleu_score"] = average_bleu_score
         over_all_dict["average_sim_score"] = average_sim_score
-        fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
+        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
         return over_all_dict
 
     def calculate_similarity_total(self, tool_type, file_types, download_dir):
         for file_type in file_types:

@@ -124,17 +96,3 @@ class Scoring:
             actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
             self.calculate_similarity(annotion, actual, file_type)
 
-
-if __name__ == "__main__":
-    file_types = list()
-    tool_type = args.tool_name
-    download_dir = args.download_dir
-    if args.document_types:
-        print("Selected types:", args.document_types)
-        for type_ in args.document_types:
-            file_types.append(type_)
-    else:
-        print("No types selected")
-    print(f"Type {file_types} is selected. Executing related operations...")
-    score = Scoring()
-    score.calculate_similarity_total(tool_type, file_types, download_dir)
-    score.summary_scores()
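With the command-line parser and the module-level fw handle removed, Scoring is now driven programmatically, which is how tools/benchmark.py uses it after this change. A minimal usage sketch under the new constructor signature; the directory and the single document type below are placeholders, not values from the repository:

import os
import markdown_calculate

data_path = "/tmp/pdf_dev"  # hypothetical download dir containing per-type annotations and tool output
score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))  # __init__ now opens the results file itself
score.calculate_similarity_total("magicpdf", ["academic_literature"], data_path)
res = score.summary_scores()  # returns the overall dict with the average_* keys and also writes it to result.json
print(res)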