Commit acab8de5 authored by myhloli's avatar myhloli

docs: update model download instructions and simplify demo scripts

- Update model download instructions for versions 0.9.x and later
- Simplify demo scripts by removing unnecessary model configuration
- Add visualization function to draw bounding boxes
- Update CLI help message with new URL
parent 460ea6b4
import os
import json
from loguru import logger
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import magic_pdf.model as model_config
model_config.__use_inside_model__ = True
try:
current_script_dir = os.path.dirname(os.path.abspath(__file__))
demo_name = "demo1"
pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
model_path = os.path.join(current_script_dir, f"{demo_name}.json")
pdf_bytes = open(pdf_path, "rb").read()
# model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
model_json = [] # model_json传空list使用内置模型解析
jso_useful_key = {"_pdf_type": "", "model_list": model_json}
jso_useful_key = {"_pdf_type": "", "model_list": []}
local_image_dir = os.path.join(current_script_dir, 'images')
image_dir = str(os.path.basename(local_image_dir))
image_writer = DiskReaderWriter(local_image_dir)
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
"""如果没有传入有效的模型数据,则使用内置model解析"""
if len(model_json) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
else:
logger.error("need model list input")
exit(1)
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
......
......@@ -4,13 +4,12 @@ import copy
from loguru import logger
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import magic_pdf.model as model_config
model_config.__use_inside_model__ = True
# todo: 设备类型选择 (?)
......@@ -47,11 +46,20 @@ def json_md_dump(
)
# 可视化
def draw_visualization_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name):
    """Draw the visualization overlays for a parsed PDF.

    Invokes ``draw_layout_bbox`` and then ``draw_span_bbox``, passing each
    the same four arguments unchanged.
    """
    # Layout boxes (which carry the sorting result) first, then span boxes.
    for draw_bbox in (draw_layout_bbox, draw_span_bbox):
        draw_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
def pdf_parse_main(
pdf_path: str,
parse_method: str = 'auto',
model_json_path: str = None,
is_json_md_dump: bool = True,
is_draw_visualization_bbox: bool = True,
output_dir: str = None
):
"""
......@@ -108,11 +116,7 @@ def pdf_parse_main(
# 如果没有传入模型数据,则使用内置模型解析
if not model_json:
if model_config.__use_inside_model__:
pipe.pipe_analyze() # 解析
else:
logger.error("need model list input")
exit(1)
# 执行解析
pipe.pipe_parse()
......@@ -121,10 +125,11 @@ def pdf_parse_main(
content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")
if is_json_md_dump:
json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
if is_draw_visualization_bbox:
draw_visualization_bbox(pipe.pdf_mid_data['pdf_info'], pdf_bytes, output_path, pdf_name)
except Exception as e:
logger.exception(e)
......@@ -132,5 +137,5 @@ def pdf_parse_main(
# Test entry point: parse a hard-coded local sample PDF when run as a script.
if __name__ == '__main__':
    # NOTE(review): this first path is overwritten by the assignment right
    # below it, so only the second path is ever passed to pdf_parse_main.
    pdf_path = r"C:\Users\XYTK2\Desktop\2024-2016-gb-cd-300.pdf"
    pdf_path = r"D:\project\20240617magicpdf\Magic-PDF\demo\demo1.pdf"
    pdf_parse_main(pdf_path)
......@@ -52,7 +52,7 @@ without method specified, auto will be used by default.""",
help="""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
You should input "Abbreviation" with language form url:
https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
""",
default=None,
)
......
......@@ -97,8 +97,6 @@ magic-pdf -p small_ocr.pdf
If your graphics card has at least **8GB** of VRAM, follow these steps to test CUDA acceleration:
> ❗ 8GB of VRAM is the bare minimum for running this application, so close all other programs that use VRAM to make sure the full 8GB is available while it runs.
1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file located in your home directory.
```json
{
......
......@@ -98,8 +98,6 @@ magic-pdf -p small_ocr.pdf
如果您的显卡显存大于等于 **8GB** ,可以进行以下流程,测试CUDA解析加速效果
> ❗️因8GB显存运行本应用非常极限,需要关闭所有其他正在使用显存的程序以确保本应用运行时有足额8GB显存可用。
**1.修改【用户目录】中配置文件magic-pdf.json中"device-mode"的值**
```json
......
......@@ -60,8 +60,6 @@ Download a sample file from the repository and test it.
If your graphics card has at least 8GB of VRAM, follow these steps to test CUDA-accelerated parsing performance.
> ❗ 8GB of VRAM is the bare minimum for running this application, so close all other programs that use VRAM to make sure the full 8GB is available while it runs.
1. **Overwrite the installation of torch and torchvision** supporting CUDA.
```
......
......@@ -61,8 +61,6 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
如果您的显卡显存大于等于 **8GB** ,可以进行以下流程,测试CUDA解析加速效果
> ❗️因8GB显存运行本应用非常极限,需要关闭所有其他正在使用显存的程序以确保本应用运行时有足额8GB显存可用。
**1.覆盖安装支持cuda的torch和torchvision**
```bash
......
......@@ -22,7 +22,9 @@ The configuration file can be found in the user directory, with the filename `ma
> Because some users reported that downloading model files with git lfs produced incomplete or corrupted files, this method is no longer recommended.
If you previously downloaded model files via git lfs, you can navigate to the previous download directory and use the `git pull` command to update the model.
When magic-pdf <= 0.8.1, if you have previously downloaded the model files via git lfs, you can navigate to the previous download directory and update the models using the `git pull` command.
> For versions 0.9.x and later, due to the repository change and the addition of the layout sorting model in PDF-Extract-Kit 1.0, the models cannot be updated using the `git pull` command. Instead, a Python script must be used for one-click updates.
## 2. Models downloaded via Hugging Face or Model Scope
......
......@@ -34,14 +34,10 @@ python脚本会自动下载模型文件并配置好配置文件中的模型目
> 由于部分用户反馈通过git lfs下载模型文件遇到下载不全和模型文件损坏情况,现已不推荐使用该方式下载。
如此前通过 git lfs 下载过模型文件,可以进入到之前的下载目录中,通过`git pull`命令更新模型。
> 0.9.x及以后版本由于新增layout排序模型,且该模型和此前的模型不在同一仓库,不能通过`git pull`命令更新,需要单独下载。
>
> ```
> from modelscope import snapshot_download
> snapshot_download('ppaanngggg/layoutreader')
> ```
当magic-pdf <= 0.8.1时,如此前通过 git lfs 下载过模型文件,可以进入到之前的下载目录中,通过`git pull`命令更新模型。
> 0.9.x及以后版本由于PDF-Extract-Kit 1.0更换仓库和新增layout排序模型,不能通过`git pull`命令更新,需要使用python脚本一键更新。
## 2. 通过 Hugging Face 或 Model Scope 下载过模型
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment