Commit ca737d5c authored by myhloli

Merge remote-tracking branch 'origin/master'

parents f8599d2b 3fc9943f
...@@ -121,11 +121,14 @@ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ ...@@ -121,11 +121,14 @@ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
下载后请将models目录移动到空间较大的ssd磁盘目录 下载后请将models目录移动到空间较大的ssd磁盘目录
#### 3. 拷贝配置文件并进行配置 #### 3. 拷贝配置文件并进行配置
在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件 在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 配置模版文件
> ❗️务必执行以下命令将配置文件拷贝到用户目录下,否则程序将无法运行
```bash ```bash
cp magic-pdf.template.json ~/magic-pdf.json cp magic-pdf.template.json ~/magic-pdf.json
``` ```
在magic-pdf.json中配置"models-dir"为模型权重文件所在目录
在用户目录中找到magic-pdf.json文件并配置"models-dir"为[2. 下载模型权重文件](#2-下载模型权重文件)中下载的模型权重文件所在目录
> ❗️务必正确配置模型权重文件所在目录,否则会因为找不到模型文件而导致程序无法运行
```json ```json
{ {
"models-dir": "/tmp/models" "models-dir": "/tmp/models"
......
...@@ -22,6 +22,7 @@ pip install magic-pdf[full-cpu] ...@@ -22,6 +22,7 @@ pip install magic-pdf[full-cpu]
pip install magic-pdf pip install magic-pdf
pip install unimernet==0.1.0 pip install unimernet==0.1.0
pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle
pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
``` ```
### 4.在部分较新的M芯片macOS设备上,MPS加速开启失败 ### 4.在部分较新的M芯片macOS设备上,MPS加速开启失败
...@@ -83,3 +84,18 @@ model_json 指的是通过模型分析后生成的一种有特定格式的json ...@@ -83,3 +84,18 @@ model_json 指的是通过模型分析后生成的一种有特定格式的json
如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成,该文件一般在项目的output目录下。 如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成,该文件一般在项目的output目录下。
如果使用 MinerU 的命令行调用内置的模型分析,该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。 如果使用 MinerU 的命令行调用内置的模型分析,该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。
参考:https://github.com/opendatalab/MinerU/issues/128 参考:https://github.com/opendatalab/MinerU/issues/128
### 10.Linux下报错:Required dependency not installed, please install by "pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"
这种情况可以先使用pip list 检查一下自己的依赖库列表,重点确认下以下几个库有没有安装(版本不一定完全一致,有就可以)
```bash
opencv-contrib-python 4.6.0.66
opencv-python 4.6.0.66
opencv-python-headless 4.10.0.84
paddleocr 2.7.3
paddlepaddle 2.6.1
torch 2.2.2
torchtext 0.17.2
torchvision 0.17.2
```
如果都有的话,可能是libgl库没有安装,参考 https://github.com/opendatalab/MinerU/issues/165#issuecomment-2245202282 安装libgl库后再试试能不能正常使用。
...@@ -89,7 +89,6 @@ def do_parse( ...@@ -89,7 +89,6 @@ def do_parse(
orig_model_list = copy.deepcopy(model_list) orig_model_list = copy.deepcopy(model_list)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method) local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
logger.info(f"local output dir is {local_md_dir}")
image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir) image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
image_dir = str(os.path.basename(local_image_dir)) image_dir = str(os.path.basename(local_image_dir))
...@@ -163,6 +162,7 @@ def do_parse( ...@@ -163,6 +162,7 @@ def do_parse(
path=f"{pdf_file_name}_content_list.json", path=f"{pdf_file_name}_content_list.json",
mode=AbsReaderWriter.MODE_TXT, mode=AbsReaderWriter.MODE_TXT,
) )
logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
@click.group() @click.group()
......
...@@ -10,14 +10,17 @@ from loguru import logger ...@@ -10,14 +10,17 @@ from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key from magic_pdf.libs.commons import parse_bucket_key
# 定义配置文件名常量
CONFIG_FILE_NAME = "magic-pdf.json"
def read_config(): def read_config():
home_dir = os.path.expanduser("~") home_dir = os.path.expanduser("~")
config_file = os.path.join(home_dir, "magic-pdf.json") config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
if not os.path.exists(config_file): if not os.path.exists(config_file):
raise Exception(f"{config_file} not found") raise FileNotFoundError(f"{config_file} not found")
with open(config_file, "r") as f: with open(config_file, "r") as f:
config = json.load(f) config = json.load(f)
...@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str): ...@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str):
access_key, secret_key, storage_endpoint = bucket_info[bucket_name] access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
if access_key is None or secret_key is None or storage_endpoint is None: if access_key is None or secret_key is None or storage_endpoint is None:
raise Exception("ak, sk or endpoint not found in magic-pdf.json") raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}") # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
...@@ -56,17 +59,32 @@ def get_bucket_name(path): ...@@ -56,17 +59,32 @@ def get_bucket_name(path):
def get_local_dir(): def get_local_dir():
config = read_config() config = read_config()
return config.get("temp-output-dir", "/tmp") local_dir = config.get("temp-output-dir")
if local_dir is None:
logger.warning(f"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default")
return "/tmp"
else:
return local_dir
def get_local_models_dir(): def get_local_models_dir():
config = read_config() config = read_config()
return config.get("models-dir", "/tmp/models") models_dir = config.get("models-dir")
if models_dir is None:
logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
return "/tmp/models"
else:
return models_dir
def get_device(): def get_device():
config = read_config() config = read_config()
return config.get("device-mode", "cpu") device = config.get("device-mode")
if device is None:
logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
return "cpu"
else:
return device
if __name__ == "__main__": if __name__ == "__main__":
......
from loguru import logger from loguru import logger
import os import os
import time import time
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
try: try:
import cv2 import cv2
import yaml import yaml
...@@ -17,14 +19,17 @@ try: ...@@ -17,14 +19,17 @@ try:
import unimernet.tasks as tasks import unimernet.tasks as tasks
from unimernet.processors import load_processor from unimernet.processors import load_processor
from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
except ImportError as e: except ImportError as e:
logger.exception(e) logger.exception(e)
logger.error('Required dependency not installed, please install by \n"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"') logger.error(
'Required dependency not installed, please install by \n'
'"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
exit(1) exit(1)
from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
def mfd_model_init(weight): def mfd_model_init(weight):
mfd_model = YOLO(weight) mfd_model = YOLO(weight)
...@@ -100,6 +105,7 @@ class CustomPEKModel: ...@@ -100,6 +105,7 @@ class CustomPEKModel:
self.device = kwargs.get("device", self.configs["config"]["device"]) self.device = kwargs.get("device", self.configs["config"]["device"])
logger.info("using device: {}".format(self.device)) logger.info("using device: {}".format(self.device))
models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models")) models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
logger.info("using models_dir: {}".format(models_dir))
# 初始化公式识别 # 初始化公式识别
if self.apply_formula: if self.apply_formula:
...@@ -135,6 +141,7 @@ class CustomPEKModel: ...@@ -135,6 +141,7 @@ class CustomPEKModel:
layout_cost = round(time.time() - layout_start, 2) layout_cost = round(time.time() - layout_start, 2)
logger.info(f"layout detection cost: {layout_cost}") logger.info(f"layout detection cost: {layout_cost}")
if self.apply_formula:
# 公式检测 # 公式检测
mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0] mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()): for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
...@@ -189,8 +196,8 @@ class CustomPEKModel: ...@@ -189,8 +196,8 @@ class CustomPEKModel:
paste_x = 50 paste_x = 50
paste_y = 50 paste_y = 50
# 创建一个宽高各多50的白色背景 # 创建一个宽高各多50的白色背景
new_width = xmax - xmin + paste_x*2 new_width = xmax - xmin + paste_x * 2
new_height = ymax - ymin + paste_y*2 new_height = ymax - ymin + paste_y * 2
new_image = Image.new('RGB', (new_width, new_height), 'white') new_image = Image.new('RGB', (new_width, new_height), 'white')
# 裁剪图像 # 裁剪图像
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment