Commit cf385779 authored by myhloli's avatar myhloli

feat(docs): automate model download and configuration

- Add scripts to download models and update configuration file
- Remove manual steps for modifying model paths
- Update documentation for both ModelScope and HuggingFace model downloads
- Improve user experience by automating the entire process
parent 0b2b0cef
# use modelscope sdk download models
import os
import requests
import json
from modelscope import snapshot_download
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
print(f"model dir is: {model_dir}/models")
def download_and_modify_json(url, local_filename, modifications):
if os.path.exists(local_filename):
data = json.load(open(local_filename))
else:
# 下载JSON文件
response = requests.get(url)
response.raise_for_status() # 检查请求是否成功
# 解析JSON内容
data = response.json()
# 修改内容
for key, value in modifications.items():
data[key] = value
# 保存修改后的内容
with open(local_filename, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
model_dir = model_dir + "/models"
print(f"model_dir is: {model_dir}")
print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json'
config_file_name = "magic-pdf.json"
home_dir = os.path.expanduser("~")
config_file = os.path.join(home_dir, config_file_name)
json_mods = {
'models-dir': model_dir,
'layoutreader-model-dir': layoutreader_model_dir,
}
download_and_modify_json(json_url, config_file, json_mods)
print(f"The configuration file has been configured successfully, the path is: {config_file}")
import os
import requests
import json
from huggingface_hub import snapshot_download
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
layoutreader_model_dir = snapshot_download('hantian/layoutreader')
print(f"model dir is: {model_dir}/models")
def download_and_modify_json(url, local_filename, modifications):
if os.path.exists(local_filename):
data = json.load(open(local_filename))
else:
# 下载JSON文件
response = requests.get(url)
response.raise_for_status() # 检查请求是否成功
# 解析JSON内容
data = response.json()
# 修改内容
for key, value in modifications.items():
data[key] = value
# 保存修改后的内容
with open(local_filename, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
layoutreader_model_dir = snapshot_download('hantian/layoutreader')
model_dir = model_dir + "/models"
print(f"model_dir is: {model_dir}")
print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
config_file_name = "magic-pdf.json"
home_dir = os.path.expanduser("~")
config_file = os.path.join(home_dir, config_file_name)
json_mods = {
'models-dir': model_dir,
'layoutreader-model-dir': layoutreader_model_dir,
}
download_and_modify_json(json_url, config_file, json_mods)
print(f"The configuration file has been configured successfully, the path is: {config_file}")
......@@ -10,11 +10,8 @@ pip install huggingface_hub
wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py
python download_models_hf.py
```
After the Python script finishes executing, it will output the directory where the models are downloaded.
The Python script will automatically download the model files and configure the model directory in the configuration file.
### 2. To modify the model path address in the configuration file
Additionally, in `~/magic-pdf.json`, update the model directory path to the absolute path of the `models` directory output by the previous Python script. Otherwise, you will encounter an error indicating that the model cannot be loaded.
# How to update models previously downloaded
......
......@@ -22,12 +22,10 @@ pip install modelscope
wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py
python download_models.py
```
python脚本执行完毕后,会输出模型下载目录
## 下载完成后的操作:修改magic-pdf.json中的模型路径
`~/magic-pdf.json`里修改模型的目录指向上一步脚本输出的models目录的绝对路径,否则会报模型无法加载的错误。
python脚本会自动下载模型文件并配置好配置文件中的模型目录
配置文件可以在用户目录中找到,文件名为`magic-pdf.json`
> windows的用户目录为 "C:\\Users\\用户名", linux用户目录为 "/home/用户名", macOS用户目录为 "/Users/用户名"
# 此前下载过模型,如何更新
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment