Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
2e79da59
Commit
2e79da59
authored
Jul 13, 2024
by
quyuan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add ci
parent
65e83285
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
31 additions
and
7 deletions
+31
-7
benchmark.yml
.github/workflows/benchmark.yml
+0
-2
benchmark.py
tests/benchmark/benchmark.py
+1
-1
env.sh
tests/benchmark/env.sh
+0
-4
test_cli.py
tests/test_cli/test_cli.py
+30
-0
No files found.
.github/workflows/benchmark.yml
View file @
2e79da59
...
...
@@ -37,6 +37,4 @@ jobs:
echo "start test"
cd $GITHUB_WORKSPACE/tests/benchmark/
tree
sh env.sh
python benchmark.py
tests/benchmark/benchmark.py
View file @
2e79da59
...
...
@@ -18,7 +18,7 @@ def test_cli():
rm_cmd
=
f
"rm -rf {pdf_res_path}"
os
.
system
(
rm_cmd
)
os
.
makedirs
(
pdf_res_path
)
cmd
=
f
'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")}
--inside_model true
'
cmd
=
f
'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")}'
os
.
system
(
cmd
)
for
root
,
dirs
,
files
in
os
.
walk
(
pdf_res_path
):
for
magic_file
in
files
:
...
...
tests/benchmark/env.sh
View file @
2e79da59
conda create
-n
MinerU
python
=
3.10
conda activate MinerU
pip
install
magic-pdf
pip
install
magic-pdf[full-cpu]
pip
install
detectron2
--extra-index-url
https://myhloli.github.io/wheels/
git lfs
install
git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit
#cp magic-pdf.template.json ~/magic-pdf.json
\ No newline at end of file
tests/test_cli/test_cli.py
View file @
2e79da59
...
...
@@ -4,6 +4,13 @@ from conf import conf
import
subprocess
from
lib
import
common
import
logging
import
os
import
json
from
loguru
import
logger
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
pdf_res_path
=
conf
.
conf
[
"pdf_res_path"
]
code_path
=
conf
.
conf
[
"code_path"
]
pdf_dev_path
=
conf
.
conf
[
"pdf_dev_path"
]
...
...
@@ -18,6 +25,29 @@ class TestCli:
common
.
check_shell
(
cmd
)
#common.count_folders_and_check_contents(pdf_res_path)
def
test_pdf_sdk
(
self
):
"""
pdf sdk 方式解析
"""
demo_names
=
list
()
for
pdf_file
in
os
.
listdir
(
pdf_dev_path
):
if
pdf_file
.
endswith
(
'.pdf'
):
demo_names
.
append
(
pdf_file
.
split
(
'.'
)[
0
])
for
demo_name
in
demo_names
:
model_path
=
os
.
path
.
join
(
pdf_dev_path
,
f
"{demo_name}.json"
)
pdf_path
=
os
.
path
.
join
(
pdf_dev_path
,
f
"{demo_name}.pdf"
)
pdf_bytes
=
open
(
pdf_path
,
"rb"
)
.
read
()
model_json
=
json
.
loads
(
open
(
model_path
,
"r"
,
encoding
=
"utf-8"
)
.
read
())
image_writer
=
DiskReaderWriter
(
pdf_dev_path
)
image_dir
=
str
(
os
.
path
.
basename
(
pdf_dev_path
))
jso_useful_key
=
{
"_pdf_type"
:
""
,
"model_list"
:
model_json
}
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
)
pipe
.
pipe_classify
()
pipe
.
pipe_parse
()
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
,
drop_mode
=
"none"
)
with
open
(
f
"{demo_name}.md"
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
md_content
)
# def test_pdf_specify_jsonl(self):
# """
# 输入jsonl, 默认方式解析
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment