Merge remote-tracking branch 'origin/master'

ca737d5c · myhloli · f8599d2b · 3fc9943f · ca737d5c · ca737d5c
Commit ca737d5c authored Jul 24, 2024 by myhloli
5 changed files
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -121,11 +121,14 @@ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
 下载后请将models目录移动到空间较大的ssd磁盘目录  

 #### 3. 拷贝配置文件并进行配置
-在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件
+在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 配置模版文件
+> ❗️务必执行以下命令将配置文件拷贝到用户目录下，否则程序将无法运行
 ```bash
 cp magic-pdf.template.json ~/magic-pdf.json
 ```
-在magic-pdf.json中配置"models-dir"为模型权重文件所在目录
+
+在用户目录中找到magic-pdf.json文件并配置"models-dir"为[2. 下载模型权重文件](#2-下载模型权重文件)中下载的模型权重文件所在目录
+> ❗️务必正确配置模型权重文件所在目录，否则会因为找不到模型文件而导致程序无法运行
 ```json
 {
  "models-dir": "/tmp/models"

--- a/docs/FAQ_zh_cn.md
+++ b/docs/FAQ_zh_cn.md
@@ -22,6 +22,7 @@ pip install magic-pdf[full-cpu]
 pip install magic-pdf
 pip install unimernet==0.1.0
 pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle
+pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ 
 ```

 ### 4.在部分较新的M芯片macOS设备上，MPS加速开启失败
@@ -82,4 +83,19 @@ pip install paddlepaddle-gpu
 model_json 指的是通过模型分析后生成的一种有特定格式的json文件。  
 如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成，该文件一般在项目的output目录下。  
 如果使用 MinerU 的命令行调用内置的模型分析，该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。  
-参考：https://github.com/opendatalab/MinerU/issues/128
\ No newline at end of file
+参考：https://github.com/opendatalab/MinerU/issues/128
+
+### 10.Linux下报错：Required dependency not installed, please install by "pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"
+
+遇到这种情况，可以先使用 `pip list` 检查已安装的依赖库列表，重点确认以下几个库是否已经安装（版本不必完全一致，已安装即可）：
+```bash
+opencv-contrib-python     4.6.0.66
+opencv-python             4.6.0.66
+opencv-python-headless    4.10.0.84
+paddleocr                 2.7.3
+paddlepaddle              2.6.1
+torch                     2.2.2
+torchtext                 0.17.2
+torchvision               0.17.2
+```
+如果以上库均已安装，则可能是系统中没有安装 libgl 库，请参考 https://github.com/opendatalab/MinerU/issues/165#issuecomment-2245202282 安装 libgl 库后，再尝试能否正常使用。
--- a/magic_pdf/cli/magicpdf.py
+++ b/magic_pdf/cli/magicpdf.py
@@ -89,7 +89,6 @@ def do_parse(
    orig_model_list = copy.deepcopy(model_list)

    local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
-    logger.info(f"local output dir is {local_md_dir}")
    image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))

@@ -163,6 +162,7 @@ def do_parse(
            path=f"{pdf_file_name}_content_list.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
+    logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")


 @click.group()

--- a/magic_pdf/libs/config_reader.py
+++ b/magic_pdf/libs/config_reader.py
@@ -10,14 +10,17 @@ from loguru import logger

 from magic_pdf.libs.commons import parse_bucket_key

+# 定义配置文件名常量
+CONFIG_FILE_NAME = "magic-pdf.json"
+

 def read_config():
    home_dir = os.path.expanduser("~")

-    config_file = os.path.join(home_dir, "magic-pdf.json")
+    config_file = os.path.join(home_dir, CONFIG_FILE_NAME)

    if not os.path.exists(config_file):
-        raise Exception(f"{config_file} not found")
+        raise FileNotFoundError(f"{config_file} not found")

    with open(config_file, "r") as f:
        config = json.load(f)
@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str):
        access_key, secret_key, storage_endpoint = bucket_info[bucket_name]

    if access_key is None or secret_key is None or storage_endpoint is None:
-        raise Exception("ak, sk or endpoint not found in magic-pdf.json")
+        raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")

    # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")

@@ -56,17 +59,32 @@ def get_bucket_name(path):

 def get_local_dir():
    config = read_config()
-    return config.get("temp-output-dir", "/tmp")
+    local_dir = config.get("temp-output-dir")
+    if local_dir is None:
+        logger.warning(f"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default")
+        return "/tmp"
+    else:
+        return local_dir


 def get_local_models_dir():
    config = read_config()
-    return config.get("models-dir", "/tmp/models")
+    models_dir = config.get("models-dir")
+    if models_dir is None:
+        logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
+        return "/tmp/models"
+    else:
+        return models_dir


 def get_device():
    config = read_config()
-    return config.get("device-mode", "cpu")
+    device = config.get("device-mode")
+    if device is None:
+        logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
+        return "cpu"
+    else:
+        return device


 if __name__ == "__main__":

--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py
 from loguru import logger
 import os
 import time
+
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 try:
    import cv2
    import yaml
@@ -17,14 +19,17 @@ try:
    import unimernet.tasks as tasks
    from unimernet.processors import load_processor

-    from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
-    from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
-    from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
 except ImportError as e:
    logger.exception(e)
-    logger.error('Required dependency not installed, please install by \n"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
+    logger.error(
+        'Required dependency not installed, please install by \n'
+        '"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
    exit(1)

+from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
+from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
+from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
+

 def mfd_model_init(weight):
    mfd_model = YOLO(weight)
@@ -100,6 +105,7 @@ class CustomPEKModel:
        self.device = kwargs.get("device", self.configs["config"]["device"])
        logger.info("using device: {}".format(self.device))
        models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
+        logger.info("using models_dir: {}".format(models_dir))

        # 初始化公式识别
        if self.apply_formula:
@@ -135,34 +141,35 @@ class CustomPEKModel:
        layout_cost = round(time.time() - layout_start, 2)
        logger.info(f"layout detection cost: {layout_cost}")

-        # 公式检测
-        mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
-        for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
-            xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
-            new_item = {
-                'category_id': 13 + int(cla.item()),
-                'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
-                'score': round(float(conf.item()), 2),
-                'latex': '',
-            }
-            layout_res.append(new_item)
-            latex_filling_list.append(new_item)
-            bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
-            mf_image_list.append(bbox_img)
-
-        # 公式识别
-        mfr_start = time.time()
-        dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
-        dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
-        mfr_res = []
-        for mf_img in dataloader:
-            mf_img = mf_img.to(self.device)
-            output = self.mfr_model.generate({'image': mf_img})
-            mfr_res.extend(output['pred_str'])
-        for res, latex in zip(latex_filling_list, mfr_res):
-            res['latex'] = latex_rm_whitespace(latex)
-        mfr_cost = round(time.time() - mfr_start, 2)
-        logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
+        if self.apply_formula:
+            # 公式检测
+            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
+            for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
+                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
+                new_item = {
+                    'category_id': 13 + int(cla.item()),
+                    'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
+                    'score': round(float(conf.item()), 2),
+                    'latex': '',
+                }
+                layout_res.append(new_item)
+                latex_filling_list.append(new_item)
+                bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
+                mf_image_list.append(bbox_img)
+
+            # 公式识别
+            mfr_start = time.time()
+            dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
+            dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
+            mfr_res = []
+            for mf_img in dataloader:
+                mf_img = mf_img.to(self.device)
+                output = self.mfr_model.generate({'image': mf_img})
+                mfr_res.extend(output['pred_str'])
+            for res, latex in zip(latex_filling_list, mfr_res):
+                res['latex'] = latex_rm_whitespace(latex)
+            mfr_cost = round(time.time() - mfr_start, 2)
+            logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")

        # ocr识别
        if self.apply_ocr:
@@ -189,8 +196,8 @@ class CustomPEKModel:
                paste_x = 50
                paste_y = 50
                # 创建一个宽高各多50的白色背景
-                new_width = xmax - xmin + paste_x*2
-                new_height = ymax - ymin + paste_y*2
+                new_width = xmax - xmin + paste_x * 2
+                new_height = ymax - ymin + paste_y * 2
                new_image = Image.new('RGB', (new_width, new_height), 'white')

                # 裁剪图像