Commit 7b712c40 authored by liukaiwen's avatar liukaiwen

Merge branch 'master' of github.com:papayalove/Magic-PDF

parents b29badc1 3aec9c61
...@@ -99,7 +99,7 @@ conda activate MinerU ...@@ -99,7 +99,7 @@ conda activate MinerU
> >
> 如需在生产环境使用CUDA/MPS加速请参考[使用CUDA或MPS加速推理](#4-使用CUDA或MPS加速推理) > 如需在生产环境使用CUDA/MPS加速请参考[使用CUDA或MPS加速推理](#4-使用CUDA或MPS加速推理)
```bash ```bash
pip install magic-pdf[full-cpu] pip install magic-pdf[full-cpu] -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
> ❗️已收到多起由于镜像源和依赖冲突问题导致安装了错误版本软件包的反馈,请务必安装完成后通过以下命令验证版本是否正确 > ❗️已收到多起由于镜像源和依赖冲突问题导致安装了错误版本软件包的反馈,请务必安装完成后通过以下命令验证版本是否正确
> ```bash > ```bash
...@@ -111,7 +111,7 @@ pip install magic-pdf[full-cpu] ...@@ -111,7 +111,7 @@ pip install magic-pdf[full-cpu]
或是直接使用我们预编译的whl包: 或是直接使用我们预编译的whl包:
> ❗️预编译版本仅支持64位系统(windows/linux/macOS)+python 3.10平台;不支持任何32位系统和非mac的arm平台,如系统不支持请自行编译安装。 > ❗️预编译版本仅支持64位系统(windows/linux/macOS)+python 3.10平台;不支持任何32位系统和非mac的arm平台,如系统不支持请自行编译安装。
```bash ```bash
pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
#### 2. 下载模型权重文件 #### 2. 下载模型权重文件
......
This diff is collapsed.
...@@ -33,6 +33,22 @@ model_dir = snapshot_download('wanderkid/PDF-Extract-Kit') ...@@ -33,6 +33,22 @@ model_dir = snapshot_download('wanderkid/PDF-Extract-Kit')
#### Git下载 #### Git下载
也可以使用git clone从 ModelScope 下载模型: 也可以使用git clone从 ModelScope 下载模型:
需要先安装git lfs
>##### On Linux
>
>Debian and RPM packages are available from packagecloud, see the [Linux installation instructions](https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md).
>
>##### On macOS
>
>[Homebrew](https://brew.sh) bottles are distributed and can be installed via `brew install git-lfs`.
>
>##### On Windows
>
>Git LFS is included in the distribution of [Git for Windows](https://gitforwindows.org/).
>Alternatively, you can install a recent version of Git LFS from the [Chocolatey](https://chocolatey.org/) package manager.
然后通过git clone下载模型:
```bash ```bash
git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
``` ```
......
...@@ -214,28 +214,32 @@ def para_to_standard_format(para, img_buket_path): ...@@ -214,28 +214,32 @@ def para_to_standard_format(para, img_buket_path):
return para_content return para_content
def para_to_standard_format_v2(para_block, img_buket_path): def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
para_type = para_block['type'] para_type = para_block['type']
if para_type == BlockType.Text: if para_type == BlockType.Text:
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': merge_para_with_text(para_block), 'text': merge_para_with_text(para_block),
'page_idx': page_idx
} }
elif para_type == BlockType.Title: elif para_type == BlockType.Title:
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': merge_para_with_text(para_block), 'text': merge_para_with_text(para_block),
'text_level': 1 'text_level': 1,
'page_idx': page_idx
} }
elif para_type == BlockType.InterlineEquation: elif para_type == BlockType.InterlineEquation:
para_content = { para_content = {
'type': 'equation', 'type': 'equation',
'text': merge_para_with_text(para_block), 'text': merge_para_with_text(para_block),
'text_format': "latex" 'text_format': "latex",
'page_idx': page_idx
} }
elif para_type == BlockType.Image: elif para_type == BlockType.Image:
para_content = { para_content = {
'type': 'image', 'type': 'image',
'page_idx': page_idx
} }
for block in para_block['blocks']: for block in para_block['blocks']:
if block['type'] == BlockType.ImageBody: if block['type'] == BlockType.ImageBody:
...@@ -245,6 +249,7 @@ def para_to_standard_format_v2(para_block, img_buket_path): ...@@ -245,6 +249,7 @@ def para_to_standard_format_v2(para_block, img_buket_path):
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
para_content = { para_content = {
'type': 'table', 'type': 'table',
'page_idx': page_idx
} }
for block in para_block['blocks']: for block in para_block['blocks']:
if block['type'] == BlockType.TableBody: if block['type'] == BlockType.TableBody:
...@@ -352,6 +357,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa ...@@ -352,6 +357,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
raise Exception(f"drop_mode can not be null") raise Exception(f"drop_mode can not be null")
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
page_idx = page_info.get("page_idx")
if not paras_of_layout: if not paras_of_layout:
continue continue
if make_mode == MakeMode.MM_MD: if make_mode == MakeMode.MM_MD:
...@@ -362,7 +368,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa ...@@ -362,7 +368,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_content = para_to_standard_format_v2(para_block, img_buket_path) para_content = para_to_standard_format_v2(para_block, img_buket_path, page_idx)
output_content.append(para_content) output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content) return '\n\n'.join(output_content)
......
...@@ -68,6 +68,8 @@ def custom_model_init(ocr: bool = False, show_log: bool = False): ...@@ -68,6 +68,8 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
model = None model = None
if model_config.__model_mode__ == "lite": if model_config.__model_mode__ == "lite":
logger.warning("The Lite mode is provided for developers to conduct testing only, and the output quality is "
"not guaranteed to be reliable.")
model = MODEL.Paddle model = MODEL.Paddle
elif model_config.__model_mode__ == "full": elif model_config.__model_mode__ == "full":
model = MODEL.PEK model = MODEL.PEK
......
...@@ -36,7 +36,7 @@ if __name__ == '__main__': ...@@ -36,7 +36,7 @@ if __name__ == '__main__':
"paddlepaddle==3.0.0b1;platform_system=='Linux'", "paddlepaddle==3.0.0b1;platform_system=='Linux'",
"paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'", "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",
], ],
"full": ["unimernet", "full": ["unimernet==0.1.6",
"matplotlib", "matplotlib",
"ultralytics", "ultralytics",
"paddleocr==2.7.3", "paddleocr==2.7.3",
......
...@@ -7,6 +7,14 @@ ...@@ -7,6 +7,14 @@
"created_at": "2024-07-28T15:55:21Z", "created_at": "2024-07-28T15:55:21Z",
"repoId": 765083837, "repoId": 765083837,
"pullRequestNo": 231 "pullRequestNo": 231
},
{
"name": "nutshellfool",
"id": 1439114,
"comment_id": 2259763094,
"created_at": "2024-07-31T06:24:39Z",
"repoId": 765083837,
"pullRequestNo": 258
} }
] ]
} }
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment