Commit ca737d5c authored by myhloli

Merge remote-tracking branch 'origin/master'

parents f8599d2b 3fc9943f
......@@ -121,11 +121,14 @@ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
下载后请将models目录移动到空间较大的ssd磁盘目录
#### 3. 拷贝配置文件并进行配置
在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件
在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 配置模版文件
> ❗️务必执行以下命令将配置文件拷贝到用户目录下,否则程序将无法运行
```bash
cp magic-pdf.template.json ~/magic-pdf.json
```
在magic-pdf.json中配置"models-dir"为模型权重文件所在目录
在用户目录中找到magic-pdf.json文件并配置"models-dir"为[2. 下载模型权重文件](#2-下载模型权重文件)中下载的模型权重文件所在目录
> ❗️务必正确配置模型权重文件所在目录,否则会因为找不到模型文件而导致程序无法运行
```json
{
"models-dir": "/tmp/models"
......
......@@ -22,6 +22,7 @@ pip install magic-pdf[full-cpu]
pip install magic-pdf
pip install unimernet==0.1.0
pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle
pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
```
### 4.在部分较新的M芯片macOS设备上,MPS加速开启失败
......@@ -83,3 +84,18 @@ model_json 指的是通过模型分析后生成的一种有特定格式的json
如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成,该文件一般在项目的output目录下。
如果使用 MinerU 的命令行调用内置的模型分析,该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。
参考:https://github.com/opendatalab/MinerU/issues/128
### 10.Linux下报错:Required dependency not installed, please install by "pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"
这种情况可以先使用pip list 检查一下自己的依赖库列表,重点确认下以下几个库有没有安装(版本不一定完全一致,有就可以)
```bash
opencv-contrib-python 4.6.0.66
opencv-python 4.6.0.66
opencv-python-headless 4.10.0.84
paddleocr 2.7.3
paddlepaddle 2.6.1
torch 2.2.2
torchtext 0.17.2
torchvision 0.17.2
```
如果都有的话,可能是libgl库没有安装,参考 https://github.com/opendatalab/MinerU/issues/165#issuecomment-2245202282 安装libgl库后再试试能不能正常使用。
......@@ -89,7 +89,6 @@ def do_parse(
orig_model_list = copy.deepcopy(model_list)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
logger.info(f"local output dir is {local_md_dir}")
image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
image_dir = str(os.path.basename(local_image_dir))
......@@ -163,6 +162,7 @@ def do_parse(
path=f"{pdf_file_name}_content_list.json",
mode=AbsReaderWriter.MODE_TXT,
)
logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
@click.group()
......
......@@ -10,14 +10,17 @@ from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key
# 定义配置文件名常量
CONFIG_FILE_NAME = "magic-pdf.json"
def read_config():
home_dir = os.path.expanduser("~")
config_file = os.path.join(home_dir, "magic-pdf.json")
config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
if not os.path.exists(config_file):
raise Exception(f"{config_file} not found")
raise FileNotFoundError(f"{config_file} not found")
with open(config_file, "r") as f:
config = json.load(f)
......@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str):
access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
if access_key is None or secret_key is None or storage_endpoint is None:
raise Exception("ak, sk or endpoint not found in magic-pdf.json")
raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
......@@ -56,17 +59,32 @@ def get_bucket_name(path):
def get_local_dir():
config = read_config()
return config.get("temp-output-dir", "/tmp")
local_dir = config.get("temp-output-dir")
if local_dir is None:
logger.warning(f"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default")
return "/tmp"
else:
return local_dir
def get_local_models_dir():
config = read_config()
return config.get("models-dir", "/tmp/models")
models_dir = config.get("models-dir")
if models_dir is None:
logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
return "/tmp/models"
else:
return models_dir
def get_device():
config = read_config()
return config.get("device-mode", "cpu")
device = config.get("device-mode")
if device is None:
logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
return "cpu"
else:
return device
if __name__ == "__main__":
......
from loguru import logger
import os
import time
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
try:
import cv2
import yaml
......@@ -17,14 +19,17 @@ try:
import unimernet.tasks as tasks
from unimernet.processors import load_processor
from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
except ImportError as e:
logger.exception(e)
logger.error('Required dependency not installed, please install by \n"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
logger.error(
'Required dependency not installed, please install by \n'
'"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
exit(1)
from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
def mfd_model_init(weight):
mfd_model = YOLO(weight)
......@@ -100,6 +105,7 @@ class CustomPEKModel:
self.device = kwargs.get("device", self.configs["config"]["device"])
logger.info("using device: {}".format(self.device))
models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
logger.info("using models_dir: {}".format(models_dir))
# 初始化公式识别
if self.apply_formula:
......@@ -135,6 +141,7 @@ class CustomPEKModel:
layout_cost = round(time.time() - layout_start, 2)
logger.info(f"layout detection cost: {layout_cost}")
if self.apply_formula:
# 公式检测
mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
......@@ -189,8 +196,8 @@ class CustomPEKModel:
paste_x = 50
paste_y = 50
# 创建一个宽高各多50的白色背景
new_width = xmax - xmin + paste_x*2
new_height = ymax - ymin + paste_y*2
new_width = xmax - xmin + paste_x * 2
new_height = ymax - ymin + paste_y * 2
new_image = Image.new('RGB', (new_width, new_height), 'white')
# 裁剪图像
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment