Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
065bf993
Commit
065bf993
authored
Nov 01, 2024
by
xu rui
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: download model file scripts path
parent
f6d381bf
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
133 additions
and
8 deletions
+133
-8
.readthedocs.yaml
.readthedocs.yaml
+2
-2
.readthedocs.yaml
next_docs/en/.readthedocs.yaml
+2
-2
download_model_weight_files.rst
...ocs/en/user_guide/install/download_model_weight_files.rst
+1
-1
install.rst
next_docs/en/user_guide/install/install.rst
+1
-1
.readthedocs.yaml
next_docs/zh_cn/.readthedocs.yaml
+2
-2
download_models.py
scripts/download_models.py
+59
-0
download_models_hf.py
scripts/download_models_hf.py
+66
-0
No files found.
.readthedocs.yaml
View file @
065bf993
...
...
@@ -10,7 +10,7 @@ formats:
python
:
install
:
-
requirements
:
docs/zh_cn/requirements.txt
-
requirements
:
next_
docs/zh_cn/requirements.txt
sphinx
:
configuration
:
docs/zh_cn/conf.py
configuration
:
next_
docs/zh_cn/conf.py
next_docs/en/.readthedocs.yaml
View file @
065bf993
...
...
@@ -10,7 +10,7 @@ formats:
python
:
install
:
-
requirements
:
docs/requirements.txt
-
requirements
:
next_
docs/requirements.txt
sphinx
:
configuration
:
docs/en/conf.py
configuration
:
next_
docs/en/conf.py
next_docs/en/user_guide/install/download_model_weight_files.rst
View file @
065bf993
...
...
@@ -17,7 +17,7 @@ Use a Python Script to Download Model Files from Hugging Face
..
code
::
bash
pip
install
huggingface_hub
wget
https
://
github
.
com
/
opendatalab
/
MinerU
/
raw
/
master
/
doc
s
/
download_models_hf
.
py
-
O
download_models_hf
.
py
wget
https
://
github
.
com
/
opendatalab
/
MinerU
/
raw
/
master
/
script
s
/
download_models_hf
.
py
-
O
download_models_hf
.
py
python
download_models_hf
.
py
The
Python
script
will
automatically
download
the
model
files
and
...
...
next_docs/en/user_guide/install/install.rst
View file @
065bf993
...
...
@@ -100,7 +100,7 @@ Download model weight files
.. code-block:: shell
pip install huggingface_hub
wget https://github.com/opendatalab/MinerU/raw/master/
doc
s/download_models_hf.py -O download_models_hf.py
wget https://github.com/opendatalab/MinerU/raw/master/
script
s/download_models_hf.py -O download_models_hf.py
python download_models_hf.py
...
...
next_docs/zh_cn/.readthedocs.yaml
View file @
065bf993
...
...
@@ -10,7 +10,7 @@ formats:
python
:
install
:
-
requirements
:
docs/requirements.txt
-
requirements
:
next_
docs/requirements.txt
sphinx
:
configuration
:
docs/zh_cn/conf.py
configuration
:
next_
docs/zh_cn/conf.py
scripts/download_models.py
0 → 100644
View file @
065bf993
import
json
import
os
import
requests
from
modelscope
import
snapshot_download
def download_json(url, timeout=30):
    """Download a JSON document over HTTP and return it parsed.

    Args:
        url: Address of the JSON document to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; defaults to 30.

    Returns:
        The decoded JSON payload, as returned by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would leave the script hanging with no feedback.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # make sure the request succeeded
    return response.json()
def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config file and apply overrides to it.

    When ``local_filename`` already exists and its ``config_version`` entry
    (default ``'0.0.0'``) is not below ``'1.0.0'``, its current content is
    kept. Otherwise, for a missing file or an older version, the content is
    fetched from ``url`` with ``download_json``. Every entry of
    ``modifications`` is then written into the data, and the result is saved
    to ``local_filename`` as UTF-8 JSON.

    Args:
        url: Location of the template JSON to download when needed.
        local_filename: Path of the config file to read and (re)write.
        modifications: Mapping of keys to the values they must be set to.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below (the file is
        # dumped with ensure_ascii=False, so it may contain raw non-ASCII
        # text), and close the handle deterministically.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        config_version = data.get('config_version', '0.0.0')
        # Plain string comparison, kept as in the original behaviour.
        if config_version < '1.0.0':
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the requested overrides.
    data.update(modifications)

    # Persist the modified content.
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # Script entry point: fetch the model weights, then point the user's
    # magic-pdf.json config at the downloaded directories.

    # Only these sub-trees of the PDF-Extract-Kit repository are requested.
    wanted_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    kit_root = snapshot_download(
        'opendatalab/PDF-Extract-Kit-1.0',
        allow_patterns=wanted_patterns,
    )
    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')

    # The weights live in the "models" sub-directory of the snapshot.
    model_dir = kit_root + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    # The config file is placed directly in the user's home directory.
    config_file = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
    template_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
    overrides = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(template_url, config_file, overrides)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
scripts/download_models_hf.py
0 → 100644
View file @
065bf993
import
json
import
os
import
requests
from
huggingface_hub
import
snapshot_download
def download_json(url, timeout=30):
    """Download a JSON document over HTTP and return it parsed.

    Args:
        url: Address of the JSON document to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; defaults to 30.

    Returns:
        The decoded JSON payload, as returned by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would leave the script hanging with no feedback.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # make sure the request succeeded
    return response.json()
def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config file and apply overrides to it.

    When ``local_filename`` already exists and its ``config_version`` entry
    (default ``'0.0.0'``) is not below ``'1.0.0'``, its current content is
    kept. Otherwise, for a missing file or an older version, the content is
    fetched from ``url`` with ``download_json``. Every entry of
    ``modifications`` is then written into the data, and the result is saved
    to ``local_filename`` as UTF-8 JSON.

    Args:
        url: Location of the template JSON to download when needed.
        local_filename: Path of the config file to read and (re)write.
        modifications: Mapping of keys to the values they must be set to.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below (the file is
        # dumped with ensure_ascii=False, so it may contain raw non-ASCII
        # text), and close the handle deterministically.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        config_version = data.get('config_version', '0.0.0')
        # Plain string comparison, kept as in the original behaviour.
        if config_version < '1.0.0':
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the requested overrides.
    data.update(modifications)

    # Persist the modified content.
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # Script entry point: fetch the model weights from Hugging Face, then
    # point the user's magic-pdf.json config at the downloaded directories.

    # Only these sub-trees of the PDF-Extract-Kit repository are requested.
    wanted_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    kit_root = snapshot_download(
        'opendatalab/PDF-Extract-Kit-1.0',
        allow_patterns=wanted_patterns,
    )

    # For layoutreader only the JSON and safetensors files are requested.
    layoutreader_files = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download(
        'hantian/layoutreader',
        allow_patterns=layoutreader_files,
    )

    # The weights live in the "models" sub-directory of the snapshot.
    model_dir = kit_root + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    # The config file is placed directly in the user's home directory.
    config_file = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
    template_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
    overrides = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(template_url, config_file, overrides)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment