Unverified Commit 7b787555 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub

Merge pull request #699 from myhloli/dev

feat(docs): automate model download and configuration
parents 8786d208 6c9b23c3
# use modelscope sdk download models import os
import requests
import json
from modelscope import snapshot_download from modelscope import snapshot_download
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
def download_and_modify_json(url, local_filename, modifications):
    """Create or update a local JSON file, applying the given overrides.

    If ``local_filename`` already exists it is loaded from disk; otherwise
    the JSON document is fetched from ``url``. Every key/value pair in
    ``modifications`` is then set on the top-level object and the result is
    written to ``local_filename`` as UTF-8.

    Args:
        url: Location of the JSON document, used only when
            ``local_filename`` does not exist yet.
        local_filename: Path of the JSON file to read (if present) and write.
        modifications: Mapping of top-level keys to the values they should
            have in the saved file.

    Raises:
        requests.HTTPError: If the download is attempted and the server
            answers with an error status.
    """
    if os.path.exists(local_filename):
        # Read through a context manager so the handle is closed, and decode
        # as UTF-8 explicitly: the file is written below as UTF-8 with
        # ensure_ascii=False, so the platform default encoding may not be
        # able to read it back.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
    else:
        # Download the JSON file.
        response = requests.get(url)
        response.raise_for_status()  # Fail loudly if the request did not succeed.
        # Parse the JSON content.
        data = response.json()

    # Apply the modifications on top of the loaded content.
    data.update(modifications)

    # Save the modified content.
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # Download both model snapshots; the configuration is pointed at the
    # "models" subdirectory of the PDF-Extract-Kit snapshot.
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit') + "/models"
    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
    print(f"model_dir is: {model_dir}")
    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")

    # The configuration file is magic-pdf.json in the user's home directory.
    config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")

    # Record the downloaded model locations in the configuration file.
    download_and_modify_json(
        'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json',
        config_file,
        {
            'models-dir': model_dir,
            'layoutreader-model-dir': layoutreader_model_dir,
        },
    )
    print(f"The configuration file has been configured successfully, the path is: {config_file}")
import os
import requests
import json
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
layoutreader_model_dir = snapshot_download('hantian/layoutreader')
def download_and_modify_json(url, local_filename, modifications):
    """Create or update a local JSON file, applying the given overrides.

    If ``local_filename`` already exists it is loaded from disk; otherwise
    the JSON document is fetched from ``url``. Every key/value pair in
    ``modifications`` is then set on the top-level object and the result is
    written to ``local_filename`` as UTF-8.

    Args:
        url: Location of the JSON document, used only when
            ``local_filename`` does not exist yet.
        local_filename: Path of the JSON file to read (if present) and write.
        modifications: Mapping of top-level keys to the values they should
            have in the saved file.

    Raises:
        requests.HTTPError: If the download is attempted and the server
            answers with an error status.
    """
    if os.path.exists(local_filename):
        # Read through a context manager so the handle is closed, and decode
        # as UTF-8 explicitly: the file is written below as UTF-8 with
        # ensure_ascii=False, so the platform default encoding may not be
        # able to read it back.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
    else:
        # Download the JSON file.
        response = requests.get(url)
        response.raise_for_status()  # Fail loudly if the request did not succeed.
        # Parse the JSON content.
        data = response.json()

    # Apply the modifications on top of the loaded content.
    data.update(modifications)

    # Save the modified content.
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # Download both model snapshots; the configuration is pointed at the
    # "models" subdirectory of the PDF-Extract-Kit snapshot.
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit') + "/models"
    layoutreader_model_dir = snapshot_download('hantian/layoutreader')
    print(f"model_dir is: {model_dir}")
    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")

    # The configuration file is magic-pdf.json in the user's home directory.
    config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")

    # Record the downloaded model locations in the configuration file.
    download_and_modify_json(
        'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json',
        config_file,
        {
            'models-dir': model_dir,
            'layoutreader-model-dir': layoutreader_model_dir,
        },
    )
    print(f"The configuration file has been configured successfully, the path is: {config_file}")
...@@ -10,12 +10,9 @@ pip install huggingface_hub ...@@ -10,12 +10,9 @@ pip install huggingface_hub
wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py
python download_models_hf.py python download_models_hf.py
``` ```
After the Python script finishes executing, it will output the directory where the models are downloaded. The Python script will automatically download the model files and configure the model directory in the configuration file.
### 2. To modify the model path address in the configuration file
Additionally, in `~/magic-pdf.json`, update the model directory path to the absolute path of the `models` directory output by the previous Python script. Otherwise, you will encounter an error indicating that the model cannot be loaded.
The configuration file can be found in the user directory, with the filename `magic-pdf.json`.
# How to update models previously downloaded # How to update models previously downloaded
......
...@@ -22,12 +22,10 @@ pip install modelscope ...@@ -22,12 +22,10 @@ pip install modelscope
wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py
python download_models.py python download_models.py
``` ```
python脚本执行完毕后,会输出模型下载目录。 python脚本会自动下载模型文件并配置好配置文件中的模型目录。
## 下载完成后的操作:修改magic-pdf.json中的模型路径
`~/magic-pdf.json`里修改模型的目录指向上一步脚本输出的models目录的绝对路径,否则会报模型无法加载的错误。
配置文件可以在用户目录中找到,文件名为`magic-pdf.json`
> windows的用户目录为 "C:\\Users\\用户名", linux用户目录为 "/home/用户名", macOS用户目录为 "/Users/用户名"
# 此前下载过模型,如何更新 # 此前下载过模型,如何更新
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment