Unverified Commit 6d571e2e authored by Kaiwen Liu, committed by GitHub

Merge pull request #7 from opendatalab/dev

Dev
parents a3358878 37c335ae
*.tar
*.tar.gz
*.zip
venv*/
envs/
slurm_logs/
sync1.sh
data_preprocess_pj1
data-preparation1
__pycache__
*.log
*.pyc
.vscode
debug/
*.ipynb
.idea

# vscode history
.history

.DS_Store
.env
bad_words/
bak/

app/tests/*
temp/
tmp/
tmp
.vscode
.vscode/
ocr_demo

.coveragerc
/app/common/__init__.py
/magic_pdf/config/__init__.py
source.dev.env

tmp

projects/web/node_modules
projects/web/dist
projects/web_demo/web_demo/static/
+cli_debug/
+debug_utils/
+
+# sphinx docs
+_build/
@@ -3,7 +3,7 @@ repos:
     rev: 5.0.4
     hooks:
       - id: flake8
-        args: ["--max-line-length=120", "--ignore=E131,E125,W503,W504,E203"]
+        args: ["--max-line-length=150", "--ignore=E131,E125,W503,W504,E203"]
   - repo: https://github.com/PyCQA/isort
     rev: 5.11.5
     hooks:
@@ -12,11 +12,12 @@ repos:
     rev: v0.32.0
     hooks:
       - id: yapf
-        args: ["--style={based_on_style: google, column_limit: 120, indent_width: 4}"]
+        args: ["--style={based_on_style: google, column_limit: 150, indent_width: 4}"]
   - repo: https://github.com/codespell-project/codespell
     rev: v2.2.1
     hooks:
       - id: codespell
+        args: ['--skip', '*.json']
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0
     hooks:
version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.10"

formats:
  - epub

python:
  install:
    - requirements: docs/zh_cn/requirements.txt

sphinx:
  configuration: docs/zh_cn/conf.py
 import os
-import json
 from loguru import logger
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-import magic_pdf.model as model_config
-model_config.__use_inside_model__ = True

 try:
     current_script_dir = os.path.dirname(os.path.abspath(__file__))
     demo_name = "demo1"
     pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
-    model_path = os.path.join(current_script_dir, f"{demo_name}.json")
     pdf_bytes = open(pdf_path, "rb").read()
-    # model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
-    model_json = []  # pass an empty model_json list to parse with the built-in models
-    jso_useful_key = {"_pdf_type": "", "model_list": model_json}
+    jso_useful_key = {"_pdf_type": "", "model_list": []}
     local_image_dir = os.path.join(current_script_dir, 'images')
     image_dir = str(os.path.basename(local_image_dir))
     image_writer = DiskReaderWriter(local_image_dir)
     pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
     pipe.pipe_classify()
-    """If no valid model data was passed in, parse with the built-in models."""
-    if len(model_json) == 0:
-        if model_config.__use_inside_model__:
-            pipe.pipe_analyze()
-        else:
-            logger.error("need model list input")
-            exit(1)
+    pipe.pipe_analyze()
     pipe.pipe_parse()
     md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
     with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
@@ -4,13 +4,12 @@ import copy
 from loguru import logger

+from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-import magic_pdf.model as model_config
-model_config.__use_inside_model__ = True

 # todo: device type selection (?)
@@ -47,11 +46,20 @@ def json_md_dump(
 )

+# visualization
+def draw_visualization_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name):
+    # draw layout boxes, annotated with reading-order results
+    draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
+    # draw span boxes
+    draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)

 def pdf_parse_main(
         pdf_path: str,
         parse_method: str = 'auto',
         model_json_path: str = None,
         is_json_md_dump: bool = True,
+        is_draw_visualization_bbox: bool = True,
         output_dir: str = None
 ):
     """
@@ -108,11 +116,7 @@ def pdf_parse_main(
         # if no model data was passed in, parse with the built-in models
         if not model_json:
-            if model_config.__use_inside_model__:
-                pipe.pipe_analyze()  # parse
-            else:
-                logger.error("need model list input")
-                exit(1)
+            pipe.pipe_analyze()  # parse

         # run the parse
         pipe.pipe_parse()
@@ -121,10 +125,11 @@ def pdf_parse_main(
         content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
         md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

         if is_json_md_dump:
             json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
+        if is_draw_visualization_bbox:
+            draw_visualization_bbox(pipe.pdf_mid_data['pdf_info'], pdf_bytes, output_path, pdf_name)

     except Exception as e:
         logger.exception(e)
@@ -132,5 +137,5 @@ def pdf_parse_main(
 # test
 if __name__ == '__main__':
-    pdf_path = r"C:\Users\XYTK2\Desktop\2024-2016-gb-cd-300.pdf"
+    pdf_path = r"D:\project\20240617magicpdf\Magic-PDF\demo\demo1.pdf"
     pdf_parse_main(pdf_path)
@@ -11,7 +11,7 @@ pip install magic-pdf[full]
### 2. Encountering the error `pickle.UnpicklingError: invalid load key, 'v'.` during use

This might be due to an incomplete download of the model file. You can try re-downloading the model file and then try again.

Reference: https://github.com/opendatalab/MinerU/issues/143

### 3. Where should the model files be downloaded and how should the `/models-dir` configuration be set?
@@ -24,7 +24,7 @@ The path for the model files is configured in "magic-pdf.json". just like:
}
```

This path is an absolute path, not a relative path. You can obtain the absolute path in the models directory using the "pwd" command.

Reference: https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874

### 4. Encountered the error `ImportError: libGL.so.1: cannot open shared object file: No such file or directory` in Ubuntu 22.04 on WSL2
@@ -38,17 +38,22 @@ sudo apt-get install libgl1-mesa-glx
Reference: https://github.com/opendatalab/MinerU/issues/388

### 5. Encountered error `ModuleNotFoundError: No module named 'fairscale'`

You need to uninstall the module and reinstall it:

```bash
pip uninstall fairscale
pip install fairscale
```

Reference: https://github.com/opendatalab/MinerU/issues/411

### 6. On some newer devices such as the H100, the text parsed during OCR using CUDA acceleration is garbled.

CUDA 11 has poor compatibility with newer graphics cards, so the CUDA version used by Paddle needs to be upgraded.

```bash
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
```

Reference: https://github.com/opendatalab/MinerU/issues/558
# FAQ

### 1. On newer versions of macOS, installing with `pip install magic-pdf[full]` fails with `zsh: no matches found: magic-pdf[full]`

On macOS, the default shell has switched from Bash to Z shell, and Z shell applies special matching logic to certain string patterns, which can trigger the "no matches found" error.
You can disable the globbing feature on the command line and then retry the installation:

```bash
setopt no_nomatch
pip install magic-pdf[full]
```

@@ -11,41 +12,50 @@ pip install magic-pdf[full]
### 2. Encountering `_pickle.UnpicklingError: invalid load key, 'v'.` during use

This may be caused by an incomplete model download; try re-downloading the model files and then retry.

Reference: https://github.com/opendatalab/MinerU/issues/143

### 3. Where should the model files be downloaded, and how should `models-dir` be configured?

The model file path is configured in "magic-pdf.json" via

```json
{
  "models-dir": "/tmp/models"
}
```

This path is an absolute path, not a relative path. You can obtain the absolute path by running "pwd" inside the models directory.

Reference: https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874

### 4. Encountering `ImportError: libGL.so.1: cannot open shared object file: No such file or directory` in Ubuntu 22.04 on WSL2

Ubuntu 22.04 on WSL2 is missing the `libgl` library; install it with:

```bash
sudo apt-get install libgl1-mesa-glx
```

Reference: https://github.com/opendatalab/MinerU/issues/388

### 5. Encountering the error `ModuleNotFoundError: No module named 'fairscale'`

Uninstall the module and reinstall it:

```bash
pip uninstall fairscale
pip install fairscale
```

Reference: https://github.com/opendatalab/MinerU/issues/411

### 6. On some newer devices such as the H100, text parsed during CUDA-accelerated OCR comes out garbled.

CUDA 11 has poor compatibility with newer graphics cards; upgrade the CUDA version used by Paddle:

```bash
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
```

Reference: https://github.com/opendatalab/MinerU/issues/558
# Ubuntu 22.04 LTS

### 1. Check if NVIDIA Drivers Are Installed

```sh
nvidia-smi
```

If you see information similar to the following, it means that the NVIDIA drivers are already installed, and you can skip Step 2.

Notice: `CUDA Version` should be >= 12.1. If the displayed version number is less than 12.1, please upgrade the driver.

```plaintext
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.34                 Driver Version: 537.34       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   51C    P8              12W / 200W |  1489MiB /  8192MiB  |      5%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
```

### 2. Install the Driver

If no driver is installed, use the following command:

```sh
sudo apt-get update
sudo apt-get install nvidia-driver-545
```

Install the proprietary driver and restart your computer after installation.

```sh
reboot
```

### 3. Install Anaconda

If Anaconda is already installed, skip this step.

```sh
wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
bash Anaconda3-2024.06-1-Linux-x86_64.sh
```

In the final step, enter `yes`, close the terminal, and reopen it.

### 4. Create an Environment Using Conda

Specify Python version 3.10.

```sh
conda create -n MinerU python=3.10
conda activate MinerU
```

### 5. Install Applications

```sh
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
```

❗ After installation, make sure to check the version of `magic-pdf` using the following command:

```sh
magic-pdf --version
```

If the version number is less than 0.7.0, please report the issue.

### 6. Download Models

Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).

-After downloading, move the `models` directory to an SSD with more space.
-❗ After downloading the models, ensure they are complete:
-- Check that the file sizes match the description on the website.
-- If possible, verify the integrity using SHA256.
-
-### 7. Configuration Before First Run
-Obtain the configuration template file `magic-pdf.template.json` from the root directory of the repository.
-❗ Execute the following command to copy the configuration file to your home directory, otherwise the program will not run:
-```sh
-wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json
-cp magic-pdf.template.json ~/magic-pdf.json
-```
-Find the `magic-pdf.json` file in your home directory and configure `"models-dir"` to be the directory where the model weights from Step 6 were downloaded.
-❗ Correctly specify the absolute path of the directory containing the model weights; otherwise, the program will fail due to missing model files.
-```json
-{
-  "models-dir": "/tmp/models"
-}
-```
+### 7. Understand the Location of the Configuration File
+
+After completing the [6. Download Models](#6-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
+You can find the `magic-pdf.json` file in your user directory.
+
+> The user directory for Linux is "/home/username".

### 8. First Run

Download a sample file from the repository and test it.

```sh
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf
magic-pdf -p small_ocr.pdf
```

### 9. Test CUDA Acceleration

If your graphics card has at least **8GB** of VRAM, follow these steps to test CUDA acceleration:

1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file located in your home directory.

   ```json
   {
     "device-mode": "cuda"
   }
   ```

@@ -105,8 +110,6 @@ If your graphics card has at least 8GB of VRAM, follow these steps to test CUDA

### 10. Enable CUDA Acceleration for OCR

-❗ The following operations require a graphics card with at least 16GB of VRAM; otherwise, the program may crash or experience reduced performance.

1. Download `paddlepaddle-gpu`. Installation will automatically enable OCR acceleration.

   ```sh
   python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
   ```
# Ubuntu 22.04 LTS

## 1. Check Whether the NVIDIA Driver Is Installed

```bash
nvidia-smi
```

If you see output similar to the following, the NVIDIA driver is already installed and you can skip step 2.

Note: the `CUDA Version` shown should be >= 12.1; if the displayed version number is lower than 12.1, please upgrade the driver.

```plaintext
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.34                 Driver Version: 537.34       CUDA Version: 12.2     |
@@ -18,96 +24,108 @@ nvidia-smi
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
```

## 2. Install the Driver

If no driver is installed, run:

```bash
sudo apt-get update
sudo apt-get install nvidia-driver-545
```

Install the proprietary driver, and restart the computer once installation completes:

```bash
reboot
```

## 3. Install Anaconda

If conda is already installed, you can skip this step.

```bash
wget -U NoSuchBrowser/1.0 https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
bash Anaconda3-2024.06-1-Linux-x86_64.sh
```

In the final step, enter yes, then close the terminal and reopen it.

## 4. Create an Environment with Conda

The Python version must be 3.10.

```bash
conda create -n MinerU python=3.10
conda activate MinerU
```

## 5. Install the Application

```bash
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
```

> ❗️After the download completes, be sure to confirm the magic-pdf version with the following command:
>
> ```bash
> magic-pdf --version
> ```
>
> If the version number is lower than 0.7.0, please report it to us in an issue.

## 6. Download Models

See [how to download model files](how_to_download_models_zh_cn.md) for details.

-After downloading, please move the models directory to an SSD with ample space.
-
-> ❗️After downloading the models, be sure to check that the model files are complete.
->
-> Check that the model file sizes in the directory match the description on the web page; if possible, verify download integrity via sha256.
-
-## 7. Configuration Before the First Run
-
-The configuration template file [magic-pdf.template.json](../magic-pdf.template.json) is available in the repository root.
-
-> ❗️Be sure to run the following commands to copy the configuration file into the user directory, otherwise the program will not run.
->
-> The Linux user directory is "/home/username".
-
-```bash
-wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json
-cp magic-pdf.template.json ~/magic-pdf.json
-```
-
-In the user directory, find the magic-pdf.json file and set "models-dir" to the directory containing the model weights downloaded in [6. Download Models](#6-download-models).
-
-> ❗️Be sure to configure the absolute path of the model weights directory correctly, otherwise the program will not run because the model files cannot be found.
->
-> ```json
-> {
->   "models-dir": "/tmp/models"
-> }
-> ```
+## 7. Understand Where the Configuration File Is Stored
+
+After completing step [6. Download Models](#6-download-models), the script automatically generates the magic-pdf.json file in the user directory and configures the default model path.
+You can find the magic-pdf.json file in the user directory.
+
+> The Linux user directory is "/home/username".

## 8. First Run

Download a sample file from the repository and test it.

```bash
wget https://gitee.com/myhloli/MinerU/raw/master/demo/small_ocr.pdf
magic-pdf -p small_ocr.pdf
```

## 9. Test CUDA Acceleration

If your graphics card has at least **8GB** of VRAM, you can run the following steps to test CUDA-accelerated parsing.

**1. Modify the value of "device-mode" in the magic-pdf.json configuration file in the user directory**

```json
{
  "device-mode": "cuda"
}
```

**2. Run the following command to test the CUDA acceleration effect**

```bash
magic-pdf -p small_ocr.pdf
```

> Tip: you can roughly judge whether CUDA acceleration is working from the per-stage timings in the log; normally, `layout detection cost` and `mfr time` should speed up by 10x or more.

## 10. Enable CUDA Acceleration for OCR

-> ❗️The following steps require a graphics card with at least 16GB of VRAM; otherwise the program may crash or slow down due to insufficient VRAM.

**1. Download paddlepaddle-gpu; installation automatically enables OCR acceleration**

```bash
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
```

**2. Run the following command to test the OCR acceleration effect**

```bash
magic-pdf -p small_ocr.pdf
```

> Tip: you can roughly judge whether CUDA acceleration is working from the per-stage timings in the log; normally, `ocr cost` should speed up by 10x or more.
# Windows 10/11

### 1. Install CUDA and cuDNN

Required versions: CUDA 11.8 + cuDNN 8.7.0

- CUDA 11.8: https://developer.nvidia.com/cuda-11-8-0-download-archive
- cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x: https://developer.nvidia.com/rdp/cudnn-archive

### 2. Install Anaconda

If Anaconda is already installed, you can skip this step.

Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86_64.exe

### 3. Create an Environment Using Conda

Python version must be 3.10.

```
conda create -n MinerU python=3.10
conda activate MinerU
```

### 4. Install Applications

```
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
```

> ❗️After installation, verify the version of `magic-pdf`:
>
> ```bash
> magic-pdf --version
> ```
>
> If the version number is less than 0.7.0, please report it in the issues section.

### 5. Download Models

Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).

-After downloading, move the `models` directory to an SSD with more space.
-❗ After downloading the models, ensure they are complete:
-- Check that the file sizes match the description on the website.
-- If possible, verify the integrity using SHA256.
-
-### 6. Configuration Before the First Run
-Obtain the configuration template file `magic-pdf.template.json` from the repository root directory.
-❗️Execute the following command to copy the configuration file to your user directory, or the program will not run.
-
-> In Windows, the user directory is "C:\Users\username".
-
-```powershell
-(New-Object System.Net.WebClient).DownloadFile('https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json', 'magic-pdf.template.json')
-cp magic-pdf.template.json ~/magic-pdf.json
-```
-Find the `magic-pdf.json` file in your user directory and configure `"models-dir"` to point to the directory where the model weights from step 5 were downloaded.
-
-> ❗️Ensure the absolute path of the model weights directory is correctly configured, or the program will fail to run due to not finding the model files.
->
-> In Windows, this path should include the drive letter and replace all `"\"` with `"/"`.
->
-> Example: If the models are placed in the root directory of drive D, the value for `model-dir` should be `"D:/models"`.
-
-```json
-{
-  "models-dir": "/tmp/models"
-}
-```
+### 6. Understand the Location of the Configuration File
+
+After completing the [5. Download Models](#5-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
+You can find the `magic-pdf.json` file in your user directory.
+
+> The user directory for Windows is "C:/Users/username".

### 7. First Run

Download a sample file from the repository and test it.

```powershell
-(New-Object System.Net.WebClient).DownloadFile('https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf', 'small_ocr.pdf')
+wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf
```

### 8. Test CUDA Acceleration

If your graphics card has at least 8GB of VRAM, follow these steps to test CUDA-accelerated parsing performance.

1. **Overwrite the installation of torch and torchvision** supporting CUDA.

   ```
   pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
   ```

   > ❗️Ensure the following versions are specified in the command:
   >
   > ```
   > torch==2.3.1 torchvision==0.18.1
   > ```
   >
   > These are the highest versions we support. Installing higher versions without specifying them will cause the program to fail.

2. **Modify the value of `"device-mode"`** in the `magic-pdf.json` configuration file located in your user directory.

   ```json
   {
     "device-mode": "cuda"
   }
   ```

3. **Run the following command to test CUDA acceleration**:

   ```
   magic-pdf -p small_ocr.pdf
   ```

### 9. Enable CUDA Acceleration for OCR

-❗️This operation requires at least 16GB of VRAM on your graphics card, otherwise it will cause the program to crash or slow down.

1. **Download paddlepaddle-gpu**, which will automatically enable OCR acceleration upon installation.

   ```
   pip install paddlepaddle-gpu==2.6.1
   ```

2. **Run the following command to test OCR acceleration**:

   ```
   magic-pdf -p small_ocr.pdf
   ```
@@ -3,103 +3,106 @@
## 1. Install CUDA and cuDNN

Required versions: CUDA 11.8 + cuDNN 8.7.0

- CUDA 11.8: https://developer.nvidia.com/cuda-11-8-0-download-archive
- cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x: https://developer.nvidia.com/rdp/cudnn-archive

## 2. Install Anaconda

If conda is already installed, you can skip this step.

Download link:
https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Windows-x86_64.exe

## 3. Create an Environment with Conda

The Python version must be 3.10.

```bash
conda create -n MinerU python=3.10
conda activate MinerU
```

## 4. Install the Application

```bash
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
```

> ❗️After the download completes, be sure to confirm the magic-pdf version with the following command:
>
> ```bash
> magic-pdf --version
> ```
>
> If the version number is lower than 0.7.0, please report it to us in an issue.

## 5. Download Models

See [how to download model files](how_to_download_models_zh_cn.md) for details.

-After downloading, please move the models directory to an SSD with ample space.
-
-> ❗️After downloading the models, be sure to check that the model files are complete.
->
-> Check that the model file sizes in the directory match the description on the web page; if possible, verify download integrity via sha256.
-
-## 6. Configuration Before the First Run
-
-The configuration template file [magic-pdf.template.json](../magic-pdf.template.json) is available in the repository root.
-
-> ❗️Be sure to run the following commands to copy the configuration file into the user directory, otherwise the program will not run.
->
-> The Windows user directory is "C:\Users\username".
-
-```powershell
-(New-Object System.Net.WebClient).DownloadFile('https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json', 'magic-pdf.template.json')
-cp magic-pdf.template.json ~/magic-pdf.json
-```
-
-In the user directory, find the magic-pdf.json file and set "models-dir" to the directory containing the model weights downloaded in [5. Download Models](#5-download-models).
-
-> ❗️Be sure to configure the absolute path of the model weights directory correctly, otherwise the program will not run because the model files cannot be found.
->
-> On Windows this path must include the drive letter, and every `"\"` in the path must be replaced with `"/"`, otherwise escaping will make the JSON file syntactically invalid.
->
-> Example: if the models are placed in the models directory at the root of drive D, the value of model-dir should be "D:/models".
-
-```json
-{
-  "models-dir": "/tmp/models"
-}
-```
+## 6. Understand Where the Configuration File Is Stored
+
+After completing step [5. Download Models](#5-download-models), the script automatically generates the magic-pdf.json file in the user directory and configures the default model path.
+You can find the magic-pdf.json file in the user directory.
+
+> The Windows user directory is "C:/Users/username".

## 7. First Run

Download a sample file from the repository and test it.

```powershell
-(New-Object System.Net.WebClient).DownloadFile('https://gitee.com/myhloli/MinerU/raw/master/demo/small_ocr.pdf', 'small_ocr.pdf')
+wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf
```

## 8. Test CUDA Acceleration

If your graphics card has at least **8GB** of VRAM, you can run the following steps to test CUDA-accelerated parsing.

**1. Overwrite-install the CUDA-enabled torch and torchvision**

```bash
pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
```

> ❗️Be sure to specify the following versions in the command:
>
> ```bash
> torch==2.3.1 torchvision==0.18.1
> ```
>
> These are the highest versions we support; installing without pinning them will pull in a higher version and the program will fail to run.

**2. Modify the value of "device-mode" in the magic-pdf.json configuration file in the user directory**

```json
{
  "device-mode": "cuda"
}
```

**3. Run the following command to test the CUDA acceleration effect**

```bash
magic-pdf -p small_ocr.pdf
```

> Tip: you can roughly judge whether CUDA acceleration is working from the per-stage timings in the log; normally, `layout detection time` and `mfr time` should speed up by 10x or more.

## 9. Enable CUDA Acceleration for OCR

-> ❗️The following steps require a graphics card with at least 16GB of VRAM; otherwise the program may crash or slow down due to insufficient VRAM.

**1. Download paddlepaddle-gpu; installation automatically enables OCR acceleration**

```bash
pip install paddlepaddle-gpu==2.6.1
```

**2. Run the following command to test the OCR acceleration effect**

```bash
magic-pdf -p small_ocr.pdf
```

> Tip: you can roughly judge whether CUDA acceleration is working from the per-stage timings in the log; normally, `ocr time` should speed up by 10x or more.
-# use modelscope sdk download models
+import json
+import os
+
+import requests
 from modelscope import snapshot_download

-model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
-print(f"model dir is: {model_dir}/models")
+
+def download_json(url):
+    # download the JSON file
+    response = requests.get(url)
+    response.raise_for_status()  # check whether the request succeeded
+    return response.json()
+
+
+def download_and_modify_json(url, local_filename, modifications):
+    if os.path.exists(local_filename):
+        data = json.load(open(local_filename))
+        config_version = data.get('config_version', '0.0.0')
+        if config_version < '1.0.0':
+            data = download_json(url)
+    else:
+        data = download_json(url)
+
+    # modify the content
+    for key, value in modifications.items():
+        data[key] = value
+
+    # save the modified content
+    with open(local_filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+    mineru_patterns = [
+        "models/Layout/LayoutLMv3/*",
+        "models/Layout/YOLO/*",
+        "models/MFD/YOLO/*",
+        "models/MFR/unimernet_small/*",
+        "models/TabRec/TableMaster/*",
+        "models/TabRec/StructEqTable/*",
+    ]
+    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
+    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
+    model_dir = model_dir + '/models'
+    print(f'model_dir is: {model_dir}')
+    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
+
+    json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
+    config_file_name = 'magic-pdf.json'
+    home_dir = os.path.expanduser('~')
+    config_file = os.path.join(home_dir, config_file_name)
+
+    json_mods = {
+        'models-dir': model_dir,
+        'layoutreader-model-dir': layoutreader_model_dir,
+    }
+
+    download_and_modify_json(json_url, config_file, json_mods)
+    print(f'The configuration file has been configured successfully, the path is: {config_file}')
+import json
+import os
+
+import requests
 from huggingface_hub import snapshot_download

-model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
-print(f"model dir is: {model_dir}/models")
+
+def download_json(url):
+    # download the JSON file
+    response = requests.get(url)
+    response.raise_for_status()  # check whether the request succeeded
+    return response.json()
+
+
+def download_and_modify_json(url, local_filename, modifications):
+    if os.path.exists(local_filename):
+        data = json.load(open(local_filename))
+        config_version = data.get('config_version', '0.0.0')
+        if config_version < '1.0.0':
+            data = download_json(url)
+    else:
+        data = download_json(url)
+
+    # modify the content
+    for key, value in modifications.items():
+        data[key] = value
+
+    # save the modified content
+    with open(local_filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+    mineru_patterns = [
+        "models/Layout/LayoutLMv3/*",
+        "models/Layout/YOLO/*",
+        "models/MFD/YOLO/*",
+        "models/MFR/unimernet_small/*",
+        "models/TabRec/TableMaster/*",
+        "models/TabRec/StructEqTable/*",
+    ]
+    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
+    layoutreader_pattern = [
+        "*.json",
+        "*.safetensors",
+    ]
+    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)
+    model_dir = model_dir + '/models'
+    print(f'model_dir is: {model_dir}')
+    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
+
+    json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
+    config_file_name = 'magic-pdf.json'
+    home_dir = os.path.expanduser('~')
+    config_file = os.path.join(home_dir, config_file_name)
+
+    json_mods = {
+        'models-dir': model_dir,
+        'layoutreader-model-dir': layoutreader_model_dir,
+    }
+
+    download_and_modify_json(json_url, config_file, json_mods)
+    print(f'The configuration file has been configured successfully, the path is: {config_file}')
Model downloads are divided into initial downloads and updates to the model directory. Please refer to the corresponding documentation for instructions on how to proceed.

# Initial download of model files

### 1. Download the Model from Hugging Face

Use a Python script to download model files from Hugging Face:

```bash
pip install huggingface_hub
-wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py
+wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
python download_models_hf.py
```

-After the Python script finishes executing, it will output the directory where the models are downloaded.
-
-### 2. To modify the model path address in the configuration file
-Additionally, in `~/magic-pdf.json`, update the model directory path to the absolute path of the `models` directory output by the previous Python script. Otherwise, you will encounter an error indicating that the model cannot be loaded.
+The Python script will automatically download the model files and configure the model directory in the configuration file.
+
+The configuration file can be found in the user directory, with the filename `magic-pdf.json`.

# How to update models previously downloaded

## 1. Models downloaded via Git LFS

> Due to feedback from some users that downloading model files using git lfs was incomplete or resulted in corrupted model files, this method is no longer recommended.

When magic-pdf <= 0.8.1, if you have previously downloaded the model files via git lfs, you can navigate to the previous download directory and update the models using the `git pull` command.

> For versions 0.9.x and later, due to the repository change and the addition of the layout sorting model in PDF-Extract-Kit 1.0, the models cannot be updated using the `git pull` command. Instead, a Python script must be used for one-click updates.

## 2. Models downloaded via Hugging Face or Model Scope
@@ -8,9 +8,8 @@
<summary>Method 1: Download the models from Hugging Face</summary>
<p>Use a Python script to download the model files from Hugging Face</p>
<pre><code>pip install huggingface_hub
-wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models_hf.py
+wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
python download_models_hf.py</code></pre>
-<p>After the Python script finishes executing, it will print the model download directory</p>
</details>

## Method 2: Download the models from ModelScope

@@ -19,24 +18,26 @@ python download_models_hf.py</code></pre>
```bash
pip install modelscope
-wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py
+wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py -O download_models.py
python download_models.py
```

-After the Python script finishes executing, it will print the model download directory
-
-## After the download: edit the model path in magic-pdf.json
-
-In `~/magic-pdf.json`, point the model directory to the absolute path of the models directory printed by the script in the previous step; otherwise the program will report that the model cannot be loaded.
-
-> The Windows user directory is "C:\\Users\\username", the Linux user directory is "/home/username", and the macOS user directory is "/Users/username".
+The Python script will automatically download the model files and configure the model directory in the configuration file.
+
+The configuration file can be found in the user directory, with the filename `magic-pdf.json`.

# Previously downloaded the models: how to update

## 1. Models downloaded via git lfs

> Some users reported incomplete downloads and corrupted model files when using git lfs, so this download method is no longer recommended.

When magic-pdf <= 0.8.1, if you previously downloaded the model files via git lfs, you can go to the original download directory and update the models with the `git pull` command.

> For 0.9.x and later, because PDF-Extract-Kit 1.0 moved to a new repository and added the layout sorting model, the models cannot be updated via `git pull`; use the Python script for a one-click update instead.

## 2. Models downloaded via Hugging Face or Model Scope
@@ -4,10 +4,20 @@
         "bucket-name-2":["ak", "sk", "endpoint"]
     },
     "models-dir":"/tmp/models",
+    "layoutreader-model-dir":"/tmp/layoutreader",
     "device-mode":"cpu",
+    "layout-config": {
+        "model": "layoutlmv3"
+    },
+    "formula-config": {
+        "mfd_model": "yolo_v8_mfd",
+        "mfr_model": "unimernet_small",
+        "enable": true
+    },
     "table-config": {
-        "model": "TableMaster",
-        "is_table_recog_enable": false,
+        "model": "tablemaster",
+        "enable": false,
         "max_time": 400
-    }
+    },
+    "config_version": "1.0.0"
 }
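For illustration, a minimal sketch of how a consumer might read the new fields out of this template once the download scripts above have written it to `~/magic-pdf.json`. This is a hypothetical helper, not part of the diff; the fallback values simply mirror the template defaults shown here:

```python
import json
import os

# Read the user-level config written by the download scripts above
# (assumes the file exists at ~/magic-pdf.json, as the docs describe).
config_path = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
with open(config_path, encoding='utf-8') as f:
    config = json.load(f)

# Keys introduced by this change, with fallbacks for older configs.
layoutreader_dir = config.get('layoutreader-model-dir', '/tmp/layoutreader')
formula_enabled = config.get('formula-config', {}).get('enable', True)
table_enabled = config.get('table-config', {}).get('enable', False)
version = config.get('config_version', '0.0.0')
print(version, layoutreader_dir, formula_enabled, table_enabled)
```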
import enum


class SupportedPdfParseMethod(enum.Enum):
    OCR = 'ocr'
    TXT = 'txt'
class FileNotExisted(Exception):

    def __init__(self, path):
        self.path = path

    def __str__(self):
        return f'File {self.path} does not exist.'


class InvalidConfig(Exception):

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return f'Invalid config: {self.msg}'


class InvalidParams(Exception):

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return f'Invalid params: {self.msg}'


class EmptyData(Exception):

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return f'Empty data: {self.msg}'
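As a usage sketch (a hypothetical caller, not part of this diff), these exception types are meant to replace bare built-in exceptions at the library boundary:

```python
import os

from magic_pdf.config.exceptions import FileNotExisted, InvalidParams


def load_pdf_bytes(path: str) -> bytes:
    # Hypothetical helper: validate inputs with the library's own exceptions.
    if not path.lower().endswith('.pdf'):
        raise InvalidParams(f'expected a .pdf file, got {path}')
    if not os.path.exists(path):
        raise FileNotExisted(path)
    with open(path, 'rb') as f:
        return f.read()
```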
from magic_pdf.data.data_reader_writer.filebase import \
FileBasedDataReader # noqa: F401
from magic_pdf.data.data_reader_writer.filebase import \
FileBasedDataWriter # noqa: F401
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
MultiBucketS3DataReader # noqa: F401
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
MultiBucketS3DataWriter # noqa: F401
from magic_pdf.data.data_reader_writer.s3 import S3DataReader # noqa: F401
from magic_pdf.data.data_reader_writer.s3 import S3DataWriter # noqa: F401
from magic_pdf.data.data_reader_writer.base import DataReader # noqa: F401
from magic_pdf.data.data_reader_writer.base import DataWriter # noqa: F401
\ No newline at end of file
from abc import ABC, abstractmethod


class DataReader(ABC):

    def read(self, path: str) -> bytes:
        """Read the file.

        Args:
            path (str): file path to read

        Returns:
            bytes: the content of the file
        """
        return self.read_at(path)

    @abstractmethod
    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read the file at offset and limit.

        Args:
            path (str): the file path
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the length of bytes want to read. Defaults to -1.

        Returns:
            bytes: the content of the file
        """
        pass


class DataWriter(ABC):

    @abstractmethod
    def write(self, path: str, data: bytes) -> None:
        """Write the data to the file.

        Args:
            path (str): the target file where to write
            data (bytes): the data want to write
        """
        pass

    def write_string(self, path: str, data: str) -> None:
        """Write the data to file, the data will be encoded to bytes.

        Args:
            path (str): the target file where to write
            data (str): the data want to write
        """
        self.write(path, data.encode())
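To make the contract concrete, here is a toy in-memory implementation (an illustrative sketch, not part of the diff): subclasses only need `read_at` and `write`; `read` and `write_string` come for free from the base classes above.

```python
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter


class InMemoryReaderWriter(DataReader, DataWriter):
    """Toy implementation backed by a dict, to illustrate the contract."""

    def __init__(self):
        self._store: dict[str, bytes] = {}

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        data = self._store[path]
        return data[offset:] if limit == -1 else data[offset:offset + limit]

    def write(self, path: str, data: bytes) -> None:
        self._store[path] = data


rw = InMemoryReaderWriter()
rw.write_string('a.txt', 'hello')     # inherited: encodes to bytes, calls write()
assert rw.read('a.txt') == b'hello'   # inherited: read() delegates to read_at()
assert rw.read_at('a.txt', offset=1, limit=3) == b'ell'
```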
import os

from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter


class FileBasedDataReader(DataReader):

    def __init__(self, parent_dir: str = ''):
        """Initialized with parent_dir.

        Args:
            parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''.
        """
        self._parent_dir = parent_dir

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read at offset and limit.

        Args:
            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the length of bytes want to read. Defaults to -1.

        Returns:
            bytes: the content of file
        """
        fn_path = path
        if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
            fn_path = os.path.join(self._parent_dir, path)

        with open(fn_path, 'rb') as f:
            f.seek(offset)
            if limit == -1:
                return f.read()
            else:
                return f.read(limit)


class FileBasedDataWriter(DataWriter):

    def __init__(self, parent_dir: str = '') -> None:
        """Initialized with parent_dir.

        Args:
            parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''.
        """
        self._parent_dir = parent_dir

    def write(self, path: str, data: bytes) -> None:
        """Write file with data.

        Args:
            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
            data (bytes): the data want to write
        """
        fn_path = path
        if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
            fn_path = os.path.join(self._parent_dir, path)

        with open(fn_path, 'wb') as f:
            f.write(data)
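A minimal usage sketch for the file-based pair (illustrative paths; note that the writer does not create parent directories itself):

```python
import os

from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter

# The writer joins relative paths with parent_dir but does not mkdir for you.
os.makedirs('/tmp/demo_output', exist_ok=True)

writer = FileBasedDataWriter('/tmp/demo_output')
writer.write('hello.bin', b'hello world')

reader = FileBasedDataReader('/tmp/demo_output')
assert reader.read('hello.bin') == b'hello world'
# Ranged read: skip 6 bytes, read the next 5.
assert reader.read_at('hello.bin', offset=6, limit=5) == b'world'
```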
from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
from magic_pdf.data.io.s3 import S3Reader, S3Writer
from magic_pdf.data.schemas import S3Config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
                                       remove_non_official_s3_args)


class MultiS3Mixin:

    def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
        """Initialized with multiple s3 configs.

        Args:
            default_bucket (str): the default bucket name of the relative path
            s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.

        Raises:
            InvalidConfig: default bucket config not in s3_configs
            InvalidConfig: bucket name not unique in s3_configs
            InvalidConfig: default bucket must be provided
        """
        if len(default_bucket) == 0:
            raise InvalidConfig('default_bucket must be provided')

        found_default_bucket_config = False
        for conf in s3_configs:
            if conf.bucket_name == default_bucket:
                found_default_bucket_config = True
                break

        if not found_default_bucket_config:
            raise InvalidConfig(
                f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
            )

        uniq_bucket = set([conf.bucket_name for conf in s3_configs])
        if len(uniq_bucket) != len(s3_configs):
            raise InvalidConfig(
                f'the bucket_name in s3_configs: {s3_configs} must be unique'
            )

        self.default_bucket = default_bucket
        self.s3_configs = s3_configs
        self._s3_clients_h: dict = {}


class MultiBucketS3DataReader(DataReader, MultiS3Mixin):

    def read(self, path: str) -> bytes:
        """Read the path from s3, selecting a different bucket client for each
        request based on the path; range reads are also supported.

        Args:
            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
                for example: s3://bucket_name/path?0,100

        Returns:
            bytes: the content of s3 file
        """
        may_range_params = parse_s3_range_params(path)
        if may_range_params is None or 2 != len(may_range_params):
            byte_start, byte_len = 0, -1
        else:
            byte_start, byte_len = int(may_range_params[0]), int(may_range_params[1])
        path = remove_non_official_s3_args(path)
        return self.read_at(path, byte_start, byte_len)

    def __get_s3_client(self, bucket_name: str):
        if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]):
            raise InvalidParams(
                f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}'
            )
        if bucket_name not in self._s3_clients_h:
            conf = next(
                filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs)
            )
            self._s3_clients_h[bucket_name] = S3Reader(
                bucket_name,
                conf.access_key,
                conf.secret_key,
                conf.endpoint_url,
                conf.addressing_style,
            )
        return self._s3_clients_h[bucket_name]

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read the file with offset and limit, selecting a different bucket
        client for each request based on the path.

        Args:
            path (str): the file path
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.

        Returns:
            bytes: the file content
        """
        if path.startswith('s3://'):
            bucket_name, path = parse_s3path(path)
            s3_reader = self.__get_s3_client(bucket_name)
        else:
            s3_reader = self.__get_s3_client(self.default_bucket)
        return s3_reader.read_at(path, offset, limit)


class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):

    def __get_s3_client(self, bucket_name: str):
        if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]):
            raise InvalidParams(
                f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}'
            )
        if bucket_name not in self._s3_clients_h:
            conf = next(
                filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs)
            )
            self._s3_clients_h[bucket_name] = S3Writer(
                bucket_name,
                conf.access_key,
                conf.secret_key,
                conf.endpoint_url,
                conf.addressing_style,
            )
        return self._s3_clients_h[bucket_name]

    def write(self, path: str, data: bytes) -> None:
        """Write file with data, also selecting a different bucket client for
        each request based on the path.

        Args:
            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
            data (bytes): the data want to write
        """
        if path.startswith('s3://'):
            bucket_name, path = parse_s3path(path)
            s3_writer = self.__get_s3_client(bucket_name)
        else:
            s3_writer = self.__get_s3_client(self.default_bucket)
        return s3_writer.write(path, data)
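A construction sketch with placeholder credentials (assuming `S3Config`, defined in `magic_pdf.data.schemas` and not shown in this diff, accepts the field names the code above accesses):

```python
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config

# One lazily created client per bucket; the default bucket serves
# bucket-less relative paths. All values below are placeholders.
configs = [
    S3Config(
        bucket_name='bucket-a',
        access_key='ak',
        secret_key='sk',
        endpoint_url='https://s3.example.com',
    ),
]
reader = MultiBucketS3DataReader('bucket-a', configs)

# Full s3:// paths are routed to the matching bucket client; the
# ?offset,limit suffix triggers a ranged read (100 bytes from offset 0),
# as documented in read() above.
head = reader.read('s3://bucket-a/some/key.pdf?0,100')
```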
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import (
    MultiBucketS3DataReader, MultiBucketS3DataWriter)
from magic_pdf.data.schemas import S3Config


class S3DataReader(MultiBucketS3DataReader):

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 reader client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'.
                Refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        super().__init__(
            bucket,
            [
                S3Config(
                    bucket_name=bucket,
                    access_key=ak,
                    secret_key=sk,
                    endpoint_url=endpoint_url,
                    addressing_style=addressing_style,
                )
            ],
        )


class S3DataWriter(MultiBucketS3DataWriter):

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 writer client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'.
                Refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        super().__init__(
            bucket,
            [
                S3Config(
                    bucket_name=bucket,
                    access_key=ak,
                    secret_key=sk,
                    endpoint_url=endpoint_url,
                    addressing_style=addressing_style,
                )
            ],
        )
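Usage sketch for the single-bucket convenience wrappers (placeholder credentials and endpoint):

```python
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter

# Thin wrappers that build a one-entry S3Config list around the
# multi-bucket classes above. All values here are placeholders.
writer = S3DataWriter('bucket-a', ak='ak', sk='sk', endpoint_url='https://s3.example.com')
writer.write('output/result.md', b'# parsed result')

reader = S3DataReader('bucket-a', ak='ak', sk='sk', endpoint_url='https://s3.example.com')
data = reader.read('output/result.md')  # relative path resolves against bucket-a
```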
from abc import ABC, abstractmethod
from typing import Iterator
import fitz
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.schemas import PageInfo
from magic_pdf.data.utils import fitz_doc_to_image
class PageableData(ABC):
@abstractmethod
def get_image(self) -> dict:
"""Transform data to image."""
pass
@abstractmethod
def get_doc(self) -> fitz.Page:
"""Get the pymudoc page."""
pass
@abstractmethod
def get_page_info(self) -> PageInfo:
"""Get the page info of the page.
Returns:
PageInfo: the page info of this page
"""
pass
class Dataset(ABC):
@abstractmethod
def __len__(self) -> int:
"""The length of the dataset."""
pass
@abstractmethod
def __iter__(self) -> Iterator[PageableData]:
"""Yield the page data."""
pass
@abstractmethod
def supported_methods(self) -> list[SupportedPdfParseMethod]:
"""The methods that this dataset support.
Returns:
list[SupportedPdfParseMethod]: The supported methods, Valid methods are: OCR, TXT
"""
pass
@abstractmethod
def data_bits(self) -> bytes:
"""The bits used to create this dataset."""
pass
@abstractmethod
def get_page(self, page_id: int) -> PageableData:
"""Get the page indexed by page_id.
Args:
page_id (int): the index of the page
Returns:
PageableData: the page doc object
"""
pass
class PymuDocDataset(Dataset):
def __init__(self, bits: bytes):
"""Initialize the dataset, which wraps the pymudoc documents.
Args:
bits (bytes): the bytes of the pdf
"""
self._records = [Doc(v) for v in fitz.open('pdf', bits)]
self._data_bits = bits
self._raw_data = bits
def __len__(self) -> int:
"""The page number of the pdf."""
return len(self._records)
def __iter__(self) -> Iterator[PageableData]:
"""Yield the page doc object."""
return iter(self._records)
def supported_methods(self) -> list[SupportedPdfParseMethod]:
"""The method supported by this dataset.
Returns:
list[SupportedPdfParseMethod]: the supported methods
"""
return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]
def data_bits(self) -> bytes:
"""The pdf bits used to create this dataset."""
return self._data_bits
def get_page(self, page_id: int) -> PageableData:
"""The page doc object.
Args:
page_id (int): the page doc index
Returns:
PageableData: the page doc object
"""
return self._records[page_id]
class ImageDataset(Dataset):
def __init__(self, bits: bytes):
"""Initialize the dataset, which wraps the pymudoc documents.
Args:
bits (bytes): the bytes of the image, which will be converted to a pdf first and then to a pymudoc document.
"""
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
self._records = [Doc(v) for v in fitz.open('pdf', pdf_bytes)]
self._raw_data = bits
self._data_bits = pdf_bytes
def __len__(self) -> int:
"""The length of the dataset."""
return len(self._records)
def __iter__(self) -> Iterator[PageableData]:
"""Yield the page object."""
return iter(self._records)
def supported_methods(self):
"""The method supported by this dataset.
Returns:
list[SupportedPdfParseMethod]: the supported methods
"""
return [SupportedPdfParseMethod.OCR]
def data_bits(self) -> bytes:
"""The pdf bits used to create this dataset."""
return self._data_bits
def get_page(self, page_id: int) -> PageableData:
"""The page doc object.
Args:
page_id (int): the page doc index
Returns:
PageableData: the page doc object
"""
return self._records[page_id]
class Doc(PageableData):
"""Initialized with pymudoc object."""
def __init__(self, doc: fitz.Page):
self._doc = doc
def get_image(self):
"""Return the imge info.
Returns:
dict: {
img: np.ndarray,
width: int,
height: int
}
"""
return fitz_doc_to_image(self._doc)
def get_doc(self) -> fitz.Page:
"""Get the pymudoc object.
Returns:
fitz.Page: the pymudoc object
"""
return self._doc
def get_page_info(self) -> PageInfo:
"""Get the page info of the page.
Returns:
PageInfo: the page info of this page
"""
page_w = self._doc.rect.width
page_h = self._doc.rect.height
return PageInfo(w=page_w, h=page_h)
def __getattr__(self, name):
if hasattr(self._doc, name):
return getattr(self._doc, name)
raise AttributeError(name)  # fail loudly instead of silently returning None for unknown attributes
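As a quick orientation, a short sketch of driving PymuDocDataset end to end using only the methods defined above; the pdf path is a placeholder.

# Hedged sketch: 'sample.pdf' is a placeholder path.
with open('sample.pdf', 'rb') as f:
    ds = PymuDocDataset(f.read())

print(len(ds))                 # page count
page = ds.get_page(0)          # first page as a Doc wrapper
info = page.get_page_info()
print(info.w, info.h)          # page width and height
img = page.get_image()['img']  # numpy array rendered via fitz_doc_to_image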
from abc import ABC, abstractmethod
class IOReader(ABC):
@abstractmethod
def read(self, path: str) -> bytes:
"""Read the file.
Args:
path (str): file path to read
Returns:
bytes: the content of the file
"""
pass
@abstractmethod
def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
"""Read at offset and limit.
Args:
path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
offset (int, optional): the number of bytes skipped. Defaults to 0.
limit (int, optional): the length of bytes want to read. Defaults to -1.
Returns:
bytes: the content of file
"""
pass
class IOWriter:
@abstractmethod
def write(self, path: str, data: bytes) -> None:
"""Write file with data.
Args:
path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
data (bytes): the data want to write
"""
pass
import io
import requests
from magic_pdf.data.io.base import IOReader, IOWriter
class HttpReader(IOReader):
def read(self, url: str) -> bytes:
"""Read the file.
Args:
url (str): the url to read
Returns:
bytes: the content of the file
"""
return requests.get(url).content
def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
"""Not Implemented."""
raise NotImplementedError
class HttpWriter(IOWriter):
def write(self, url: str, data: bytes) -> None:
"""Write file with data.
Args:
url (str): the url to post the data to.
data (bytes): the data want to write
"""
files = {'file': io.BytesIO(data)}
response = requests.post(url, files=files)
assert 199 < response.status_code < 300  # require a 2xx response
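A minimal sketch of the two HTTP helpers; both URLs are placeholders, and the upload endpoint is assumed to accept a multipart file POST.

# Hedged sketch: the example.com URLs are placeholders.
reader = HttpReader()
data = reader.read('https://example.com/sample.pdf')  # plain GET

writer = HttpWriter()
writer.write('https://example.com/upload', data)      # multipart POST; asserts a 2xx status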
import boto3
from botocore.config import Config
from magic_pdf.data.io.base import IOReader, IOWriter
class S3Reader(IOReader):
def __init__(
self,
bucket: str,
ak: str,
sk: str,
endpoint_url: str,
addressing_style: str = 'auto',
):
"""s3 reader client.
Args:
bucket (str): bucket name
ak (str): access key
sk (str): secret key
endpoint_url (str): endpoint url of s3
addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
"""
self._bucket = bucket
self._ak = ak
self._sk = sk
self._s3_client = boto3.client(
service_name='s3',
aws_access_key_id=ak,
aws_secret_access_key=sk,
endpoint_url=endpoint_url,
config=Config(
s3={'addressing_style': addressing_style},
retries={'max_attempts': 5, 'mode': 'standard'},
),
)
def read(self, key: str) -> bytes:
"""Read the file.
Args:
key (str): the object key to read
Returns:
bytes: the content of the file
"""
return self.read_at(key)
def read_at(self, key: str, offset: int = 0, limit: int = -1) -> bytes:
"""Read at offset and limit.
Args:
key (str): the object key to read.
offset (int, optional): the number of bytes skipped. Defaults to 0.
limit (int, optional): the length of bytes want to read. Defaults to -1.
Returns:
bytes: the content of file
"""
if limit > -1:
range_header = f'bytes={offset}-{offset+limit-1}'
res = self._s3_client.get_object(
Bucket=self._bucket, Key=key, Range=range_header
)
else:
res = self._s3_client.get_object(
Bucket=self._bucket, Key=key, Range=f'bytes={offset}-'
)
return res['Body'].read()
class S3Writer(IOWriter):
def __init__(
self,
bucket: str,
ak: str,
sk: str,
endpoint_url: str,
addressing_style: str = 'auto',
):
"""s3 reader client.
Args:
bucket (str): bucket name
ak (str): access key
sk (str): secret key
endpoint_url (str): endpoint url of s3
addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
"""
self._bucket = bucket
self._ak = ak
self._sk = sk
self._s3_client = boto3.client(
service_name='s3',
aws_access_key_id=ak,
aws_secret_access_key=sk,
endpoint_url=endpoint_url,
config=Config(
s3={'addressing_style': addressing_style},
retries={'max_attempts': 5, 'mode': 'standard'},
),
)
def write(self, key: str, data: bytes):
"""Write file with data.
Args:
key (str): the object key to write.
data (bytes): the data want to write
"""
self._s3_client.put_object(Bucket=self._bucket, Key=key, Body=data)
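A sketch of a ranged read with S3Reader; credentials, endpoint, and key are placeholders. With offset=0 and limit=4 the request carries the header Range: bytes=0-3, which for a well-formed pdf returns the %PDF magic.

# Hedged sketch: placeholder credentials and key.
reader = S3Reader(bucket='my-bucket', ak='my-ak', sk='my-sk',
                  endpoint_url='https://s3.example.com')
magic = reader.read_at('papers/sample.pdf', offset=0, limit=4)
print(magic)  # b'%PDF' for a well-formed pdf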
import json
import os
from pathlib import Path
from magic_pdf.config.exceptions import EmptyData, InvalidParams
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
MultiBucketS3DataReader)
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
def read_jsonl(
s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
) -> list[PymuDocDataset]:
"""Read the jsonl file and return the list of PymuDocDataset.
Args:
s3_path_or_local (str): local file or s3 path
s3_client (MultiBucketS3DataReader | None, optional): s3 client that support multiple bucket. Defaults to None.
Raises:
InvalidParams: if s3_path_or_local is s3 path but s3_client is not provided.
EmptyData: if no pdf file location is provided in some line of jsonl file.
InvalidParams: if the file location is s3 path but s3_client is not provided
Returns:
list[PymuDocDataset]: each line in the jsonl file will be converted to a PymuDocDataset
"""
bits_arr = []
if s3_path_or_local.startswith('s3://'):
if s3_client is None:
raise InvalidParams('s3_client is required when s3_path is provided')
jsonl_bits = s3_client.read(s3_path_or_local)
else:
jsonl_bits = FileBasedDataReader('').read(s3_path_or_local)
jsonl_d = [
json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
]
for d in jsonl_d[:5]:  # NOTE: only the first five jsonl records are processed
pdf_path = d.get('file_location', '') or d.get('path', '')
if len(pdf_path) == 0:
raise EmptyData('pdf file location is empty')
if pdf_path.startswith('s3://'):
if s3_client is None:
raise InvalidParams('s3_client is required when s3_path is provided')
bits_arr.append(s3_client.read(pdf_path))
else:
bits_arr.append(FileBasedDataReader('').read(pdf_path))
return [PymuDocDataset(bits) for bits in bits_arr]
def read_local_pdfs(path: str) -> list[PymuDocDataset]:
"""Read pdf from path or directory.
Args:
path (str): pdf file path or directory that contains pdf files
Returns:
list[PymuDocDataset]: each pdf file will be converted to a PymuDocDataset
"""
if os.path.isdir(path):
reader = FileBasedDataReader(path)
return [
PymuDocDataset(reader.read(doc_path.name))
for doc_path in Path(path).glob('*.pdf')
]
else:
reader = FileBasedDataReader()
bits = reader.read(path)
return [PymuDocDataset(bits)]
def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
"""Read images from path or directory.
Args:
path (str): image file path or directory that contains image files
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
Returns:
list[ImageDataset]: each image file will be converted to an ImageDataset
"""
if os.path.isdir(path):
imgs_bits = []
s_suffixes = set(suffixes)
reader = FileBasedDataReader(path)
for root, _, files in os.walk(path):
for file in files:
suffix = file.split('.')
if suffix[-1] in s_suffixes:
imgs_bits.append(reader.read(file))
return [ImageDataset(bits) for bits in imgs_bits]
else:
reader = FileBasedDataReader()
bits = reader.read(path)
return [ImageDataset(bits)]
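A usage sketch of the local loaders above; the directory paths and suffix list are placeholders.

# Hedged sketch: placeholder paths.
pdf_datasets = read_local_pdfs('downloads/')  # every *.pdf in the directory
img_datasets = read_local_images('scans/', suffixes=['jpg', 'png'])
print(len(pdf_datasets), len(img_datasets))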
from pydantic import BaseModel, Field
class S3Config(BaseModel):
bucket_name: str = Field(description='s3 bucket name', min_length=1)
access_key: str = Field(description='s3 access key', min_length=1)
secret_key: str = Field(description='s3 secret key', min_length=1)
endpoint_url: str = Field(description='s3 endpoint url', min_length=1)
addressing_style: str = Field(description='s3 addressing style', default='auto', min_length=1)
class PageInfo(BaseModel):
w: float = Field(description='the width of page')
h: float = Field(description='the height of page')
import fitz
import numpy as np
from magic_pdf.utils.annotations import ImportPIL
@ImportPIL
def fitz_doc_to_image(doc, dpi=200) -> dict:
"""Convert fitz.Document to image, Then convert the image to numpy array.
Args:
doc (_type_): pymudoc page
dpi (int, optional): reset the dpi of dpi. Defaults to 200.
Returns:
dict: {'img': numpy array, 'width': width, 'height': height }
"""
from PIL import Image
mat = fitz.Matrix(dpi / 72, dpi / 72)
pm = doc.get_pixmap(matrix=mat, alpha=False)
# If the width or height exceeds 9000 after scaling, do not scale further.
if pm.width > 9000 or pm.height > 9000:
pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
img = np.array(img)
img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
return img_dict
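A sketch of rendering a single page with the helper above; the pdf path is a placeholder.

# Hedged sketch: placeholder path.
import fitz

doc = fitz.open('sample.pdf')
img_dict = fitz_doc_to_image(doc[0])  # first page at the default 200 dpi
print(img_dict['width'], img_dict['height'], img_dict['img'].shape)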
...@@ -10,18 +10,12 @@ custom fields at the block level ...@@ -10,18 +10,12 @@
# whether the lines in the block have been deleted # whether the lines in the block have been deleted
LINES_DELETED = "lines_deleted" LINES_DELETED = "lines_deleted"
# struct eqtable
STRUCT_EQTABLE = "struct_eqtable"
# table recognition max time default value # table recognition max time default value
TABLE_MAX_TIME_VALUE = 400 TABLE_MAX_TIME_VALUE = 400
# pp_table_result_max_length # pp_table_result_max_length
TABLE_MAX_LEN = 480 TABLE_MAX_LEN = 480
# pp table structure algorithm
TABLE_MASTER = "TableMaster"
# table master structure dict # table master structure dict
TABLE_MASTER_DICT = "table_master_structure_dict.txt" TABLE_MASTER_DICT = "table_master_structure_dict.txt"
...@@ -38,3 +32,16 @@ REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer" ...@@ -38,3 +32,16 @@ REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
REC_CHAR_DICT = "ppocr_keys_v1.txt" REC_CHAR_DICT = "ppocr_keys_v1.txt"
class MODEL_NAME:
# pp table structure algorithm
TABLE_MASTER = "tablemaster"
# struct eqtable
STRUCT_EQTABLE = "struct_eqtable"
DocLayout_YOLO = "doclayout_yolo"
LAYOUTLMv3 = "layoutlmv3"
YOLO_V8_MFD = "yolo_v8_mfd"
UniMerNet_v2_Small = "unimernet_small"
\ No newline at end of file
...@@ -445,3 +445,38 @@ def get_overlap_area(bbox1, bbox2): ...@@ -445,3 +445,38 @@ def get_overlap_area(bbox1, bbox2):
# The area of overlap area # The area of overlap area
return (x_right - x_left) * (y_bottom - y_top) return (x_right - x_left) * (y_bottom - y_top)
def calculate_vertical_projection_overlap_ratio(block1, block2):
"""
Calculate the fraction of block1's horizontal extent covered by the overlap of the two blocks' vertical (x-axis) projections.
Args:
block1 (tuple): Coordinates of the first block (x0, y0, x1, y1).
block2 (tuple): Coordinates of the second block (x0, y0, x1, y1).
Returns:
float: the overlap length of the two x-axis projections divided by block1's width.
"""
x0_1, _, x1_1, _ = block1
x0_2, _, x1_2, _ = block2
# Calculate the intersection of the x-coordinates
x_left = max(x0_1, x0_2)
x_right = min(x1_1, x1_2)
if x_right < x_left:
return 0.0
# Length of the intersection
intersection_length = x_right - x_left
# Length of the x-axis projection of the first block
block1_length = x1_1 - x0_1
if block1_length == 0:
return 0.0
# Proportion of the x-axis covered by the intersection
# logger.info(f"intersection_length: {intersection_length}, block1_length: {block1_length}")
return intersection_length / block1_length
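A worked example of the ratio: block1 spans x in [0, 100] and block2 spans x in [10, 60], so the shared x-range is [10, 60], i.e. 50 of block1's 100 units.

# Worked example: the intersection [10, 60] is 50 units of block1's 100-unit width.
ratio = calculate_vertical_projection_overlap_ratio((0, 0, 100, 20), (10, 30, 60, 40))
assert ratio == 0.5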
""" """根据bucket的名字返回对应的s3 AK, SK,endpoint三元组."""
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
"""
import json import json
import os import os
from loguru import logger from loguru import logger
from magic_pdf.libs.Constants import MODEL_NAME
from magic_pdf.libs.commons import parse_bucket_key from magic_pdf.libs.commons import parse_bucket_key
# define the config file name constant # define the config file name constant
CONFIG_FILE_NAME = "magic-pdf.json" CONFIG_FILE_NAME = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'magic-pdf.json')
def read_config(): def read_config():
home_dir = os.path.expanduser("~") if os.path.isabs(CONFIG_FILE_NAME):
config_file = CONFIG_FILE_NAME
config_file = os.path.join(home_dir, CONFIG_FILE_NAME) else:
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
if not os.path.exists(config_file): if not os.path.exists(config_file):
raise FileNotFoundError(f"{config_file} not found") raise FileNotFoundError(f'{config_file} not found')
with open(config_file, "r", encoding="utf-8") as f: with open(config_file, 'r', encoding='utf-8') as f:
config = json.load(f) config = json.load(f)
return config return config
def get_s3_config(bucket_name: str): def get_s3_config(bucket_name: str):
""" """~/magic-pdf.json 读出来."""
~/magic-pdf.json 读出来
"""
config = read_config() config = read_config()
bucket_info = config.get("bucket_info") bucket_info = config.get('bucket_info')
if bucket_name not in bucket_info: if bucket_name not in bucket_info:
access_key, secret_key, storage_endpoint = bucket_info["[default]"] access_key, secret_key, storage_endpoint = bucket_info['[default]']
else: else:
access_key, secret_key, storage_endpoint = bucket_info[bucket_name] access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
if access_key is None or secret_key is None or storage_endpoint is None: if access_key is None or secret_key is None or storage_endpoint is None:
raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}") raise Exception(f'ak, sk or endpoint not found in {CONFIG_FILE_NAME}')
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}") # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
...@@ -49,7 +47,7 @@ def get_s3_config(bucket_name: str): ...@@ -49,7 +47,7 @@ def get_s3_config(bucket_name: str):
def get_s3_config_dict(path: str): def get_s3_config_dict(path: str):
access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path)) access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint} return {'ak': access_key, 'sk': secret_key, 'endpoint': storage_endpoint}
def get_bucket_name(path): def get_bucket_name(path):
...@@ -59,33 +57,65 @@ def get_bucket_name(path): ...@@ -59,33 +57,65 @@ def get_bucket_name(path):
def get_local_models_dir(): def get_local_models_dir():
config = read_config() config = read_config()
models_dir = config.get("models-dir") models_dir = config.get('models-dir')
if models_dir is None: if models_dir is None:
logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default") logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
return "/tmp/models" return '/tmp/models'
else: else:
return models_dir return models_dir
def get_local_layoutreader_model_dir():
config = read_config()
layoutreader_model_dir = config.get('layoutreader-model-dir')
if layoutreader_model_dir is None or not os.path.exists(layoutreader_model_dir):
home_dir = os.path.expanduser('~')
layoutreader_at_modelscope_dir_path = os.path.join(home_dir, '.cache/modelscope/hub/ppaanngggg/layoutreader')
logger.warning(f"'layoutreader-model-dir' not exists, use {layoutreader_at_modelscope_dir_path} as default")
return layoutreader_at_modelscope_dir_path
else:
return layoutreader_model_dir
def get_device(): def get_device():
config = read_config() config = read_config()
device = config.get("device-mode") device = config.get('device-mode')
if device is None: if device is None:
logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default") logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
return "cpu" return 'cpu'
else: else:
return device return device
def get_table_recog_config(): def get_table_recog_config():
config = read_config() config = read_config()
table_config = config.get("table-config") table_config = config.get('table-config')
if table_config is None: if table_config is None:
logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default") logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
return json.loads('{"is_table_recog_enable": false, "max_time": 400}') return json.loads(f'{{"model": "{MODEL_NAME.TABLE_MASTER}","enable": false, "max_time": 400}}')
else: else:
return table_config return table_config
def get_layout_config():
config = read_config()
layout_config = config.get("layout-config")
if layout_config is None:
logger.warning(f"'layout-config' not found in {CONFIG_FILE_NAME}, use '{MODEL_NAME.LAYOUTLMv3}' as default")
return json.loads(f'{{"model": "{MODEL_NAME.LAYOUTLMv3}"}}')
else:
return layout_config
def get_formula_config():
config = read_config()
formula_config = config.get("formula-config")
if formula_config is None:
logger.warning(f"'formula-config' not found in {CONFIG_FILE_NAME}, use 'True' as default")
return json.loads(f'{{"mfd_model": "{MODEL_NAME.YOLO_V8_MFD}","mfr_model": "{MODEL_NAME.UniMerNet_v2_Small}","enable": true}}')
else:
return formula_config
if __name__ == "__main__": if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw") ak, sk, endpoint = get_s3_config("llm-raw")
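For reference, a sketch of a config covering every key the functions above read, written as a Python dict mirroring magic-pdf.json. All values are placeholders; the model names come from the MODEL_NAME constants introduced in this diff.

# Hedged sketch of a config dict mirroring magic-pdf.json; all values are placeholders.
example_config = {
    "bucket_info": {
        "[default]": ["<ak>", "<sk>", "<endpoint>"],  # fallback triple for get_s3_config
    },
    "models-dir": "/tmp/models",
    "layoutreader-model-dir": "/tmp/layoutreader",
    "device-mode": "cpu",
    "layout-config": {"model": "layoutlmv3"},
    "formula-config": {"mfd_model": "yolo_v8_mfd", "mfr_model": "unimernet_small", "enable": True},
    "table-config": {"model": "tablemaster", "enable": False, "max_time": 400},
}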
...@@ -20,6 +20,8 @@ class BlockType: ...@@ -20,6 +20,8 @@ class BlockType:
InterlineEquation = 'interline_equation' InterlineEquation = 'interline_equation'
Footnote = 'footnote' Footnote = 'footnote'
Discarded = 'discarded' Discarded = 'discarded'
List = 'list'
Index = 'index'
class CategoryId: class CategoryId:
......
...@@ -4,7 +4,9 @@ import fitz ...@@ -4,7 +4,9 @@ import fitz
import numpy as np import numpy as np
from loguru import logger from loguru import logger
from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config, get_layout_config, \
get_formula_config
from magic_pdf.model.model_list import MODEL from magic_pdf.model.model_list import MODEL
import magic_pdf.model as model_config import magic_pdf.model as model_config
...@@ -23,7 +25,7 @@ def remove_duplicates_dicts(lst): ...@@ -23,7 +25,7 @@ def remove_duplicates_dicts(lst):
return unique_dicts return unique_dicts
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list: def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
try: try:
from PIL import Image from PIL import Image
except ImportError: except ImportError:
...@@ -32,18 +34,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list: ...@@ -32,18 +34,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
images = [] images = []
with fitz.open("pdf", pdf_bytes) as doc: with fitz.open("pdf", pdf_bytes) as doc:
pdf_page_num = doc.page_count
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
if end_page_id > pdf_page_num - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = pdf_page_num - 1
for index in range(0, doc.page_count): for index in range(0, doc.page_count):
page = doc[index] if start_page_id <= index <= end_page_id:
mat = fitz.Matrix(dpi / 72, dpi / 72) page = doc[index]
pm = page.get_pixmap(matrix=mat, alpha=False) mat = fitz.Matrix(dpi / 72, dpi / 72)
pm = page.get_pixmap(matrix=mat, alpha=False)
# If the width or height exceeds 9000 after scaling, do not scale further.
if pm.width > 9000 or pm.height > 9000:
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
# If the width or height exceeds 9000 after scaling, do not scale further. img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
if pm.width > 9000 or pm.height > 9000: img = np.array(img)
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) img_dict = {"img": img, "width": pm.width, "height": pm.height}
else:
img_dict = {"img": [], "width": 0, "height": 0}
img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
img = np.array(img)
img_dict = {"img": img, "width": pm.width, "height": pm.height}
images.append(img_dict) images.append(img_dict)
return images return images
...@@ -57,14 +69,17 @@ class ModelSingleton: ...@@ -57,14 +69,17 @@ class ModelSingleton:
cls._instance = super().__new__(cls) cls._instance = super().__new__(cls)
return cls._instance return cls._instance
def get_model(self, ocr: bool, show_log: bool, lang=None): def get_model(self, ocr: bool, show_log: bool, lang=None, layout_model=None, formula_enable=None, table_enable=None):
key = (ocr, show_log, lang) key = (ocr, show_log, lang, layout_model, formula_enable, table_enable)
if key not in self._models: if key not in self._models:
self._models[key] = custom_model_init(ocr=ocr, show_log=show_log, lang=lang) self._models[key] = custom_model_init(ocr=ocr, show_log=show_log, lang=lang, layout_model=layout_model,
formula_enable=formula_enable, table_enable=table_enable)
return self._models[key] return self._models[key]
def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None): def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None,
layout_model=None, formula_enable=None, table_enable=None):
model = None model = None
if model_config.__model_mode__ == "lite": if model_config.__model_mode__ == "lite":
...@@ -84,14 +99,30 @@ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None): ...@@ -84,14 +99,30 @@ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None):
# read model-dir and device from the config file # read model-dir and device from the config file
local_models_dir = get_local_models_dir() local_models_dir = get_local_models_dir()
device = get_device() device = get_device()
layout_config = get_layout_config()
if layout_model is not None:
layout_config["model"] = layout_model
formula_config = get_formula_config()
if formula_enable is not None:
formula_config["enable"] = formula_enable
table_config = get_table_recog_config() table_config = get_table_recog_config()
model_input = {"ocr": ocr, if table_enable is not None:
"show_log": show_log, table_config["enable"] = table_enable
"models_dir": local_models_dir,
"device": device, model_input = {
"table_config": table_config, "ocr": ocr,
"lang": lang, "show_log": show_log,
} "models_dir": local_models_dir,
"device": device,
"table_config": table_config,
"layout_config": layout_config,
"formula_config": formula_config,
"lang": lang,
}
custom_model = CustomPEKModel(**model_input) custom_model = CustomPEKModel(**model_input)
else: else:
logger.error("Not allow model_name!") logger.error("Not allow model_name!")
...@@ -106,19 +137,23 @@ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None): ...@@ -106,19 +137,23 @@ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None):
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
start_page_id=0, end_page_id=None, lang=None): start_page_id=0, end_page_id=None, lang=None,
layout_model=None, formula_enable=None, table_enable=None):
model_manager = ModelSingleton() if lang == "":
custom_model = model_manager.get_model(ocr, show_log, lang) lang = None
images = load_images_from_pdf(pdf_bytes) model_manager = ModelSingleton()
custom_model = model_manager.get_model(ocr, show_log, lang, layout_model, formula_enable, table_enable)
# end_page_id = end_page_id if end_page_id else len(images) - 1 with fitz.open("pdf", pdf_bytes) as doc:
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(images) - 1 pdf_page_num = doc.page_count
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
if end_page_id > pdf_page_num - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = pdf_page_num - 1
if end_page_id > len(images) - 1: images = load_images_from_pdf(pdf_bytes, start_page_id=start_page_id, end_page_id=end_page_id)
logger.warning("end_page_id is out of range, use images length")
end_page_id = len(images) - 1
model_json = [] model_json = []
doc_analyze_start = time.time() doc_analyze_start = time.time()
...@@ -135,6 +170,11 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, ...@@ -135,6 +170,11 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
page_dict = {"layout_dets": result, "page_info": page_info} page_dict = {"layout_dets": result, "page_info": page_info}
model_json.append(page_dict) model_json.append(page_dict)
gc_start = time.time()
clean_memory()
gc_time = round(time.time() - gc_start, 2)
logger.info(f"gc time: {gc_time}")
doc_analyze_time = round(time.time() - doc_analyze_start, 2) doc_analyze_time = round(time.time() - doc_analyze_start, 2)
doc_analyze_speed = round( (end_page_id + 1 - start_page_id) / doc_analyze_time, 2) doc_analyze_speed = round( (end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
logger.info(f"doc analyze time: {round(time.time() - doc_analyze_start, 2)}," logger.info(f"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
......
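A sketch of calling doc_analyze with the knobs added in this diff; the pdf path and language are placeholders.

# Hedged sketch: placeholder path and language.
with open('sample.pdf', 'rb') as f:
    model_json = doc_analyze(
        f.read(),
        ocr=False,
        start_page_id=0,
        end_page_id=None,               # None means "through the last page"
        lang='en',
        layout_model='doclayout_yolo',  # overrides layout-config["model"]
        formula_enable=True,            # overrides formula-config["enable"]
        table_enable=False,             # overrides table-config["enable"]
    )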
...@@ -52,11 +52,11 @@ class ppTableModel(object): ...@@ -52,11 +52,11 @@ class ppTableModel(object):
rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR) rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT) rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
device = kwargs.get("device", "cpu") device = kwargs.get("device", "cpu")
use_gpu = True if device == "cuda" else False use_gpu = True if device.startswith("cuda") else False
config = { config = {
"use_gpu": use_gpu, "use_gpu": use_gpu,
"table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN), "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
"table_algorithm": TABLE_MASTER, "table_algorithm": "TableMaster",
"table_model_dir": table_model_dir, "table_model_dir": table_model_dir,
"table_char_dict_path": table_char_dict_path, "table_char_dict_path": table_char_dict_path,
"det_model_dir": det_model_dir, "det_model_dir": det_model_dir,
......
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
...@@ -8,10 +10,11 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -8,10 +10,11 @@ def parse_pdf_by_ocr(pdf_bytes,
end_page_id=None, end_page_id=None,
debug_mode=False, debug_mode=False,
): ):
return pdf_parse_union(pdf_bytes, dataset = PymuDocDataset(pdf_bytes)
return pdf_parse_union(dataset,
model_list, model_list,
imageWriter, imageWriter,
"ocr", SupportedPdfParseMethod.OCR,
start_page_id=start_page_id, start_page_id=start_page_id,
end_page_id=end_page_id, end_page_id=end_page_id,
debug_mode=debug_mode, debug_mode=debug_mode,
......
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
...@@ -9,10 +11,11 @@ def parse_pdf_by_txt( ...@@ -9,10 +11,11 @@ def parse_pdf_by_txt(
end_page_id=None, end_page_id=None,
debug_mode=False, debug_mode=False,
): ):
return pdf_parse_union(pdf_bytes, dataset = PymuDocDataset(pdf_bytes)
return pdf_parse_union(dataset,
model_list, model_list,
imageWriter, imageWriter,
"txt", SupportedPdfParseMethod.TXT,
start_page_id=start_page_id, start_page_id=start_page_id,
end_page_id=end_page_id, end_page_id=end_page_id,
debug_mode=debug_mode, debug_mode=debug_mode,
......
...@@ -17,7 +17,7 @@ class AbsPipe(ABC): ...@@ -17,7 +17,7 @@ class AbsPipe(ABC):
PIP_TXT = "txt" PIP_TXT = "txt"
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False, def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None): start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
self.pdf_bytes = pdf_bytes self.pdf_bytes = pdf_bytes
self.model_list = model_list self.model_list = model_list
self.image_writer = image_writer self.image_writer = image_writer
...@@ -26,6 +26,9 @@ class AbsPipe(ABC): ...@@ -26,6 +26,9 @@ class AbsPipe(ABC):
self.start_page_id = start_page_id self.start_page_id = start_page_id
self.end_page_id = end_page_id self.end_page_id = end_page_id
self.lang = lang self.lang = lang
self.layout_model = layout_model
self.formula_enable = formula_enable
self.table_enable = table_enable
def get_compress_pdf_mid_data(self): def get_compress_pdf_mid_data(self):
return JsonCompressor.compress_json(self.pdf_mid_data) return JsonCompressor.compress_json(self.pdf_mid_data)
...@@ -95,9 +98,7 @@ class AbsPipe(ABC): ...@@ -95,9 +98,7 @@ class AbsPipe(ABC):
""" """
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"] pdf_info_list = pdf_mid_data["pdf_info"]
parse_type = pdf_mid_data["_parse_type"] content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
lang = pdf_mid_data.get("_lang", None)
content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path, parse_type, lang)
return content_list return content_list
@staticmethod @staticmethod
...@@ -107,9 +108,7 @@ class AbsPipe(ABC): ...@@ -107,9 +108,7 @@ class AbsPipe(ABC):
""" """
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"] pdf_info_list = pdf_mid_data["pdf_info"]
parse_type = pdf_mid_data["_parse_type"] md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
lang = pdf_mid_data.get("_lang", None)
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path, parse_type, lang)
return md_content return md_content
...@@ -10,8 +10,10 @@ from magic_pdf.user_api import parse_ocr_pdf ...@@ -10,8 +10,10 @@ from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe): class OCRPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False, def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None): start_page_id=0, end_page_id=None, lang=None,
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang) layout_model=None, formula_enable=None, table_enable=None):
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
layout_model, formula_enable, table_enable)
def pipe_classify(self): def pipe_classify(self):
pass pass
...@@ -19,12 +21,14 @@ class OCRPipe(AbsPipe): ...@@ -19,12 +21,14 @@ class OCRPipe(AbsPipe):
def pipe_analyze(self): def pipe_analyze(self):
self.model_list = doc_analyze(self.pdf_bytes, ocr=True, self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
start_page_id=self.start_page_id, end_page_id=self.end_page_id, start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang) lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_parse(self): def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug, self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
start_page_id=self.start_page_id, end_page_id=self.end_page_id, start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang) lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
result = super().pipe_mk_uni_format(img_parent_path, drop_mode) result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
......
...@@ -11,8 +11,10 @@ from magic_pdf.user_api import parse_txt_pdf ...@@ -11,8 +11,10 @@ from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe): class TXTPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False, def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None): start_page_id=0, end_page_id=None, lang=None,
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang) layout_model=None, formula_enable=None, table_enable=None):
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
layout_model, formula_enable, table_enable)
def pipe_classify(self): def pipe_classify(self):
pass pass
...@@ -20,12 +22,14 @@ class TXTPipe(AbsPipe): ...@@ -20,12 +22,14 @@ class TXTPipe(AbsPipe):
def pipe_analyze(self): def pipe_analyze(self):
self.model_list = doc_analyze(self.pdf_bytes, ocr=False, self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
start_page_id=self.start_page_id, end_page_id=self.end_page_id, start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang) lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_parse(self): def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug, self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
start_page_id=self.start_page_id, end_page_id=self.end_page_id, start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang) lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
result = super().pipe_mk_uni_format(img_parent_path, drop_mode) result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
......
...@@ -14,9 +14,11 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf ...@@ -14,9 +14,11 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe(AbsPipe): class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False, def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None): start_page_id=0, end_page_id=None, lang=None,
layout_model=None, formula_enable=None, table_enable=None):
self.pdf_type = jso_useful_key["_pdf_type"] self.pdf_type = jso_useful_key["_pdf_type"]
super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id, lang) super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id,
lang, layout_model, formula_enable, table_enable)
if len(self.model_list) == 0: if len(self.model_list) == 0:
self.input_model_is_empty = True self.input_model_is_empty = True
else: else:
...@@ -29,18 +31,21 @@ class UNIPipe(AbsPipe): ...@@ -29,18 +31,21 @@ class UNIPipe(AbsPipe):
if self.pdf_type == self.PIP_TXT: if self.pdf_type == self.PIP_TXT:
self.model_list = doc_analyze(self.pdf_bytes, ocr=False, self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
start_page_id=self.start_page_id, end_page_id=self.end_page_id, start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang) lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
elif self.pdf_type == self.PIP_OCR: elif self.pdf_type == self.PIP_OCR:
self.model_list = doc_analyze(self.pdf_bytes, ocr=True, self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
start_page_id=self.start_page_id, end_page_id=self.end_page_id, start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang) lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_parse(self): def pipe_parse(self):
if self.pdf_type == self.PIP_TXT: if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer, self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty, is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
start_page_id=self.start_page_id, end_page_id=self.end_page_id, start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang) lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
elif self.pdf_type == self.PIP_OCR: elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug, is_debug=self.is_debug,
......
from loguru import logger from loguru import logger
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \ from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
calculate_iou calculate_iou, calculate_vertical_projection_overlap_ratio
from magic_pdf.libs.drop_tag import DropTag from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType from magic_pdf.libs.ocr_content_type import BlockType
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
...@@ -60,29 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc ...@@ -60,29 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
return all_bboxes, all_discarded_blocks, drop_reasons return all_bboxes, all_discarded_blocks, drop_reasons
def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_blocks, text_blocks, def add_bboxes(blocks, block_type, bboxes):
title_blocks, interline_equation_blocks, page_w, page_h): for block in blocks:
all_bboxes = [] x0, y0, x1, y1 = block['bbox']
all_discarded_blocks = [] if block_type in [
for image in img_blocks: BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
x0, y0, x1, y1 = image['bbox'] BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]]) ]:
bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"], block["group_id"]])
else:
bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]])
for table in table_blocks:
x0, y0, x1, y1 = table['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]])
for text in text_blocks: def ocr_prepare_bboxes_for_layout_split_v2(
x0, y0, x1, y1 = text['bbox'] img_body_blocks, img_caption_blocks, img_footnote_blocks,
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]]) table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h
):
all_bboxes = []
for title in title_blocks: add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes)
x0, y0, x1, y1 = title['bbox'] add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes)
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]]) add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes)
add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes)
for interline_equation in interline_equation_blocks: add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes)
x0, y0, x1, y1 = interline_equation['bbox'] add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes)
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]]) add_bboxes(text_blocks, BlockType.Text, all_bboxes)
add_bboxes(title_blocks, BlockType.Title, all_bboxes)
add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)
'''resolve block nesting issues''' '''resolve block nesting issues'''
'''when a text box overlaps a title box, trust the text box first''' '''when a text box overlaps a title box, trust the text box first'''
...@@ -96,23 +101,47 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b ...@@ -96,23 +101,47 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
'''when an interline_equation box is contained inside a text-type box and is much smaller than the text block, trust the text box and discard the equation box''' '''when an interline_equation box is contained inside a text-type box and is much smaller than the text block, trust the text box and discard the equation box'''
# removed later by the big-box-contains-small-box logic # removed later by the big-box-contains-small-box logic
'''keep only discarded_blocks wider than 1/3 of the page width, taller than 10, and in the bottom 50% of the page (footnotes only)''' '''discarded_blocks'''
all_discarded_blocks = []
add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
'''footnote detection: wider than 1/3 of the page width, taller than 10, and located in the bottom 50% of the page'''
footnote_blocks = []
for discarded in discarded_blocks: for discarded in discarded_blocks:
x0, y0, x1, y1 = discarded['bbox'] x0, y0, x1, y1 = discarded['bbox']
all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]]) if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
# 将footnote加入到all_bboxes中,用来计算layout footnote_blocks.append([x0, y0, x1, y1])
# if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
# all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]]) '''remove any boxes below a footnote'''
need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
if len(need_remove_blocks) > 0:
for block in need_remove_blocks:
all_bboxes.remove(block)
all_discarded_blocks.append(block)
'''if a big box still contains a small box after the steps above, delete the small box''' '''if a big box still contains a small box after the steps above, delete the small box'''
all_bboxes = remove_overlaps_min_blocks(all_bboxes) all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
'''separate the remaining bboxes to avoid errors in later layout splitting''' '''separate the remaining bboxes to avoid errors in later layout splitting'''
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
return all_bboxes, all_discarded_blocks return all_bboxes, all_discarded_blocks
def find_blocks_under_footnote(all_bboxes, footnote_blocks):
need_remove_blocks = []
for block in all_bboxes:
block_x0, block_y0, block_x1, block_y1 = block[:4]
for footnote_bbox in footnote_blocks:
footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
# if the footnote's vertical projection covers 80% of the block's vertical projection and the block's y0 >= the footnote's y1
if block_y0 >= footnote_y1 and calculate_vertical_projection_overlap_ratio((block_x0, block_y0, block_x1, block_y1), footnote_bbox) >= 0.8:
if block not in need_remove_blocks:
need_remove_blocks.append(block)
break
return need_remove_blocks
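A worked example of the removal rule: the block starts below the footnote (y0 of 730 >= y1 of 720) and its x-projection [10, 90] is fully covered by the footnote's [0, 100], so the ratio is 1.0 >= 0.8 and the block is removed.

# Worked example with one footnote and one block below it.
footnotes = [[0, 700, 100, 720]]
block = [10, 730, 90, 750, None, None, None, 'text', None, None, None, None, 0.9]
removed = find_blocks_under_footnote([block], footnotes)
assert removed == [block]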
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes): def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
# 先提取所有text和interline block # 先提取所有text和interline block
text_blocks = [] text_blocks = []
......
...@@ -49,8 +49,7 @@ def merge_spans_to_line(spans): ...@@ -49,8 +49,7 @@ def merge_spans_to_line(spans):
continue continue
# if the current span overlaps the last span of the current line on the y-axis, add it to the current line # if the current span overlaps the last span of the current line on the y-axis, add it to the current line
if __is_overlaps_y_exceeds_threshold(span['bbox'], if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.5):
current_line[-1]['bbox']):
current_line.append(span) current_line.append(span)
else: else:
# otherwise, start a new line # otherwise, start a new line
...@@ -154,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio): ...@@ -154,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
'type': block_type, 'type': block_type,
'bbox': block_bbox, 'bbox': block_bbox,
} }
if block_type in [
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
]:
block_dict["group_id"] = block[-1]
block_spans = [] block_spans = []
for span in spans: for span in spans:
span_bbox = span['bbox'] span_bbox = span['bbox']
...@@ -202,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks): ...@@ -202,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return fix_blocks return fix_blocks
def fix_block_spans_v2(block_with_spans):
"""1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
需要将caption和footnote的text_span放入相应img_block和table_block内的
caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
fix_blocks = []
for block in block_with_spans:
block_type = block['type']
if block_type in [BlockType.Text, BlockType.Title,
BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableCaption, BlockType.TableFootnote
]:
block = fix_text_block(block)
elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
block = fix_interline_block(block)
else:
continue
fix_blocks.append(block)
return fix_blocks
def fix_discarded_block(discarded_block_with_spans): def fix_discarded_block(discarded_block_with_spans):
fix_discarded_blocks = [] fix_discarded_blocks = []
for block in discarded_block_with_spans: for block in discarded_block_with_spans:
......
config:
device: cpu
layout: True
formula: True
table_config:
model: TableMaster
is_table_recog_enable: False
max_time: 400
weights: weights:
layout: Layout/model_final.pth layoutlmv3: Layout/LayoutLMv3/model_final.pth
mfd: MFD/weights.pt doclayout_yolo: Layout/YOLO/doclayout_yolo_ft.pt
mfr: MFR/unimernet_small yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
unimernet_small: MFR/unimernet_small
struct_eqtable: TabRec/StructEqTable struct_eqtable: TabRec/StructEqTable
TableMaster: TabRec/TableMaster tablemaster: TabRec/TableMaster
\ No newline at end of file \ No newline at end of file
...@@ -52,7 +52,7 @@ without method specified, auto will be used by default.""", ...@@ -52,7 +52,7 @@ without method specified, auto will be used by default.""",
help=""" help="""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional. Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
You should input "Abbreviation" with language form url: You should input "Abbreviation" with language form url:
https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
""", """,
default=None, default=None,
) )
......
...@@ -6,8 +6,8 @@ import click ...@@ -6,8 +6,8 @@ import click
from loguru import logger from loguru import logger
import magic_pdf.model as model_config import magic_pdf.model as model_config
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox, from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
draw_model_bbox, draw_line_sort_bbox) draw_model_bbox, draw_span_bbox)
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe from magic_pdf.pipe.TXTPipe import TXTPipe
...@@ -46,10 +46,12 @@ def do_parse( ...@@ -46,10 +46,12 @@ def do_parse(
start_page_id=0, start_page_id=0,
end_page_id=None, end_page_id=None,
lang=None, lang=None,
layout_model=None,
formula_enable=None,
table_enable=None,
): ):
if debug_able: if debug_able:
logger.warning('debug mode is on') logger.warning('debug mode is on')
# f_dump_content_list = True
f_draw_model_bbox = True f_draw_model_bbox = True
f_draw_line_sort_bbox = True f_draw_line_sort_bbox = True
...@@ -64,13 +66,16 @@ def do_parse( ...@@ -64,13 +66,16 @@ def do_parse(
if parse_method == 'auto': if parse_method == 'auto':
jso_useful_key = {'_pdf_type': '', 'model_list': model_list} jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True, pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
start_page_id=start_page_id, end_page_id=end_page_id, lang=lang) start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
elif parse_method == 'txt': elif parse_method == 'txt':
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True, pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
start_page_id=start_page_id, end_page_id=end_page_id, lang=lang) start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
elif parse_method == 'ocr': elif parse_method == 'ocr':
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True, pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
start_page_id=start_page_id, end_page_id=end_page_id, lang=lang) start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
else: else:
logger.error('unknown parse method') logger.error('unknown parse method')
exit(1) exit(1)
......
...@@ -101,11 +101,19 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr ...@@ -101,11 +101,19 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False): if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr") logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
if input_model_is_empty: if input_model_is_empty:
pdf_models = doc_analyze(pdf_bytes, layout_model = kwargs.get("layout_model", None)
ocr=True, formula_enable = kwargs.get("formula_enable", None)
start_page_id=start_page_id, table_enable = kwargs.get("table_enable", None)
end_page_id=end_page_id, pdf_models = doc_analyze(
lang=lang) pdf_bytes,
ocr=True,
start_page_id=start_page_id,
end_page_id=end_page_id,
lang=lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pdf_info_dict = parse_pdf(parse_pdf_by_ocr) pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None: if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.") raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
......
from loguru import logger
def ImportPIL(f):
try:
import PIL # noqa: F401
except ImportError:
logger.error('Pillow not installed, please install by pip.')
exit(1)
return f
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.10"
formats:
- epub
python:
install:
- requirements: docs/requirements.txt
sphinx:
configuration: docs/en/conf.py
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
Data API
------------------
.. toctree::
:maxdepth: 2
api/dataset.rst
api/data_reader_writer.rst
api/read_api.rst