Qin Kaijie / pdf-miner / Commits

Commit 4bedda7b
authored Jun 28, 2024 by quyuan
update require txt
parent b474a00a
Showing 2 changed files with 10 additions and 52 deletions:
tools/benchmark.py            +2   -2
tools/markdown_calculate.py   +8   -50

tools/benchmark.py   View file @ 4bedda7b

@@ -36,8 +36,8 @@ def calculate_score():
     os.system(cmd)
     cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
     os.system(cmd)
-    score = markdown_calculate.Scoring()
-    score.calculate_similarity_total("magicpdf", file_types, os.path.join(data_path, "result.json"))
+    score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
+    score.calculate_similarity_total("magicpdf", file_types, data_path)
     res = score.summary_scores()
     return res
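
With this change the result file is owned by the Scoring instance instead of a module-level handle in markdown_calculate.py, so benchmark.py passes the path once at construction time and hands calculate_similarity_total the download directory. A minimal sketch of the updated call pattern, assuming markdown_calculate is importable (PYTHONPATH set as in the cmd string above) and with placeholder values for data_path and file_types:

import os

import markdown_calculate  # tools/markdown_calculate.py, as imported by tools/benchmark.py

data_path = "/path/to/download_dir"            # assumed benchmark download directory
file_types = ["academic_literature", "notes"]  # assumed subset of the supported document types

# The Scoring instance now opens and owns the result file itself.
score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))

# Compare magicpdf's cleaned output against the annotations for each document type.
score.calculate_similarity_total("magicpdf", file_types, data_path)

# Aggregate per-type scores; the overall summary is also written to result.json.
res = score.summary_scores()
print(res)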

tools/markdown_calculate.py   View file @ 4bedda7b

@@ -7,44 +7,16 @@ import re
 import scoring
 import argparse
-parser = argparse.ArgumentParser(description="get directory")
-parser.add_argument('--document_types', nargs='+',
-                    choices=["academic_literature", "atlas", "courseware", "colorful_textbook",
-                             "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper",
-                             "ordinary_textbook", "research_report", "special_exam_paper"],
-                    help='Choose one or more document_types',
-                    default=["academic_literature", "atlas", "courseware", "colorful_textbook",
-                             "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper",
-                             "ordinary_textbook", "research_report", "special_exam_paper"])
-parser.add_argument(
-    "--tool_name",
-    type=str,
-    required=True,
-    help="tool name",
-)
-parser.add_argument(
-    "--download_dir",
-    type=str,
-    required=True,
-    help="input download dir",
-)
-parser.add_argument(
-    "--results",
-    type=str,
-    required=True,
-    help="results path(end with .json)",
-)
-args = parser.parse_args()
-fw = open(args.results, 'w+', encoding='utf-8')
 # Initialize lists to store edit distances and BLEU scores
 class Scoring:
-    def __init__(self):
+    def __init__(self, result_path):
         self.edit_distances = []
         self.bleu_scores = []
         self.sim_scores = []
         self.filenames = []
         self.score_dict = {}
         self.anntion_cnt = 0
+        self.fw = open(result_path, "w+")

     def simple_bleu_score(self, candidate, reference):
         candidate_tokens = word_tokenize(candidate)
         reference_tokens = word_tokenize(reference)

@@ -93,12 +65,12 @@ class Scoring:
         class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
         class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
         class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
-        fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
+        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
         ratio = len(class_dict) / total_file
-        fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
-        fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
-        fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
-        fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
+        self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
+        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
+        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
+        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
         print(f"{tool_type} extract ratio: {ratio}")
         print(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")

@@ -115,7 +87,7 @@ class Scoring:
         over_all_dict["average_edit_distance"] = average_edit_distance
         over_all_dict["average_bleu_score"] = average_bleu_score
         over_all_dict["average_sim_score"] = average_sim_score
-        fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
+        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
         return over_all_dict

     def calculate_similarity_total(self, tool_type, file_types, download_dir):

@@ -124,17 +96,3 @@ class Scoring:
             actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
             self.calculate_similarity(annotion, actual, file_type)

-if __name__ == "__main__":
-    file_types = list()
-    tool_type = args.tool_name
-    download_dir = args.download_dir
-    if args.document_types:
-        print("Selected types:", args.document_types)
-        for type_ in args.document_types:
-            file_types.append(type_)
-    else:
-        print("No types selected")
-    print(f"Type {file_types} is selected. Executing related operations...")
-    score = Scoring()
-    score.calculate_similarity_total(tool_type, file_types, download_dir)
-    score.summary_scores()
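
Because the module-level argparse block and the __main__ entry point were deleted, markdown_calculate.py is now a library module driven entirely through the Scoring class; any command-line use has to be wrapped by the caller. A rough sketch of such a wrapper, reusing the option names from the removed block (the wrapper script itself is hypothetical and not part of this commit):

# run_scoring.py -- hypothetical CLI wrapper around the refactored Scoring class
import argparse

import markdown_calculate  # tools/markdown_calculate.py

DOCUMENT_TYPES = [
    "academic_literature", "atlas", "courseware", "colorful_textbook",
    "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper",
    "ordinary_textbook", "research_report", "special_exam_paper",
]

parser = argparse.ArgumentParser(description="get directory")
parser.add_argument("--document_types", nargs="+", choices=DOCUMENT_TYPES,
                    default=DOCUMENT_TYPES, help="Choose one or more document_types")
parser.add_argument("--tool_name", type=str, required=True, help="tool name")
parser.add_argument("--download_dir", type=str, required=True, help="input download dir")
parser.add_argument("--results", type=str, required=True, help="results path (end with .json)")
args = parser.parse_args()

# The result path now goes to the constructor instead of a module-level open().
score = markdown_calculate.Scoring(args.results)
score.calculate_similarity_total(args.tool_name, list(args.document_types), args.download_dir)
print(score.summary_scores())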