Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
351078f1
Unverified
Commit
351078f1
authored
Jul 13, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Jul 13, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #141 from dt-yy/master
add ci
parents
833189a3
4fcaf679
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
12 additions
and
17 deletions
+12
-17
calculate_score.py
tests/test_cli/lib/calculate_score.py
+1
-1
pre_clean.py
tests/test_cli/lib/pre_clean.py
+0
-3
scoring.py
tests/test_cli/lib/scoring.py
+3
-0
result.json
tests/test_cli/pdf_dev/result.json
+1
-0
test_bench.py
tests/test_cli/test_bench.py
+7
-13
No files found.
tests/test_cli/lib/calculate_score.py
View file @
351078f1
...
@@ -4,12 +4,12 @@ calculate_score
...
@@ -4,12 +4,12 @@ calculate_score
import
os
import
os
import
re
import
re
import
json
import
json
from
Levenshtein
import
distance
from
lib
import
scoring
from
lib
import
scoring
from
nltk.translate.bleu_score
import
sentence_bleu
,
SmoothingFunction
from
nltk.translate.bleu_score
import
sentence_bleu
,
SmoothingFunction
from
nltk.tokenize
import
word_tokenize
from
nltk.tokenize
import
word_tokenize
import
nltk
import
nltk
nltk
.
download
(
'punkt'
)
nltk
.
download
(
'punkt'
)
from
Levenshtein
import
distance
class
Scoring
:
class
Scoring
:
"""
"""
...
...
tests/test_cli/lib/pre_clean.py
View file @
351078f1
...
@@ -118,9 +118,6 @@ def clean_data(prod_type, download_dir):
...
@@ -118,9 +118,6 @@ def clean_data(prod_type, download_dir):
with
open
(
input_file
,
'r'
,
encoding
=
'utf-8'
)
as
fr
:
with
open
(
input_file
,
'r'
,
encoding
=
'utf-8'
)
as
fr
:
content
=
fr
.
read
()
content
=
fr
.
read
()
new_content
=
clean_markdown_images
(
content
)
new_content
=
clean_markdown_images
(
content
)
new_content
=
convert_html_table_to_md
(
new_content
)
new_content
=
convert_latext_to_md
(
new_content
)
new_content
=
convert_htmltale_to_md
(
new_content
)
with
open
(
output_file
,
'w'
,
encoding
=
'utf-8'
)
as
fw
:
with
open
(
output_file
,
'w'
,
encoding
=
'utf-8'
)
as
fw
:
fw
.
write
(
new_content
)
fw
.
write
(
new_content
)
...
...
tests/test_cli/lib/scoring.py
View file @
351078f1
"""
Calculate simscore, refer to (https://github.com/VikParuchuri/marker?tab=readme-ov-file)
"""
import
math
import
math
from
rapidfuzz
import
fuzz
from
rapidfuzz
import
fuzz
...
...
tests/test_cli/pdf_dev/result.json
0 → 100644
View file @
351078f1
{
"average_sim_score"
:
0.6505598645664856
,
"average_edit_distance"
:
0.2514908429188901
,
"average_bleu_score"
:
0.5808819533975296
}
\ No newline at end of file
tests/test_cli/test_bench.py
View file @
351078f1
...
@@ -11,9 +11,6 @@ from conf import conf
...
@@ -11,9 +11,6 @@ from conf import conf
code_path
=
os
.
environ
.
get
(
'GITHUB_WORKSPACE'
)
code_path
=
os
.
environ
.
get
(
'GITHUB_WORKSPACE'
)
pdf_dev_path
=
conf
.
conf
[
"pdf_dev_path"
]
pdf_dev_path
=
conf
.
conf
[
"pdf_dev_path"
]
pdf_res_path
=
conf
.
conf
[
"pdf_res_path"
]
pdf_res_path
=
conf
.
conf
[
"pdf_res_path"
]
last_simscore
=
0
last_editdistance
=
0
last_bleu
=
0
class
TestBench
():
class
TestBench
():
"""
"""
...
@@ -23,7 +20,6 @@ class TestBench():
...
@@ -23,7 +20,6 @@ class TestBench():
"""
"""
ci benchmark
ci benchmark
"""
"""
try
:
fr
=
open
(
os
.
path
.
join
(
pdf_dev_path
,
"result.json"
),
"r"
,
encoding
=
"utf-8"
)
fr
=
open
(
os
.
path
.
join
(
pdf_dev_path
,
"result.json"
),
"r"
,
encoding
=
"utf-8"
)
lines
=
fr
.
readlines
()
lines
=
fr
.
readlines
()
last_line
=
lines
[
-
1
]
.
strip
()
last_line
=
lines
[
-
1
]
.
strip
()
...
@@ -31,8 +27,6 @@ class TestBench():
...
@@ -31,8 +27,6 @@ class TestBench():
last_simscore
=
last_score
[
"average_sim_score"
]
last_simscore
=
last_score
[
"average_sim_score"
]
last_editdistance
=
last_score
[
"average_edit_distance"
]
last_editdistance
=
last_score
[
"average_edit_distance"
]
last_bleu
=
last_score
[
"average_bleu_score"
]
last_bleu
=
last_score
[
"average_bleu_score"
]
except
IOError
:
print
(
"result.json not exist"
)
os
.
system
(
f
"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}"
)
os
.
system
(
f
"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}"
)
now_score
=
get_score
()
now_score
=
get_score
()
print
(
"now_score:"
,
now_score
)
print
(
"now_score:"
,
now_score
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment