Commit 2a06e0c8 authored by xuchao's avatar xuchao

Make the documentation on how to download the model more concise

parent f052c75e
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
<a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
<!-- language --> <!-- language -->
[English](README.md) | [简体中文](README_zh-CN.md) | [日本語](README_ja-JP.md) [English](README.md) | [简体中文](README_zh-CN.md)
<!-- hot link --> <!-- hot link -->
...@@ -305,7 +305,6 @@ TODO ...@@ -305,7 +305,6 @@ TODO
- 漫画书、艺术图册、小学教材、习题尚不能很好解析 - 漫画书、艺术图册、小学教材、习题尚不能很好解析
- 在一些公式密集的PDF上强制启用OCR效果会更好 - 在一些公式密集的PDF上强制启用OCR效果会更好
- 如果您要处理包含大量公式的pdf,强烈建议开启OCR功能。使用pymuPDF提取文字的时候会出现文本行互相重叠的情况导致公式插入位置不准确。 - 如果您要处理包含大量公式的pdf,强烈建议开启OCR功能。使用pymuPDF提取文字的时候会出现文本行互相重叠的情况导致公式插入位置不准确。
-
好消息是,这些我们正在努力实现! 好消息是,这些我们正在努力实现!
......
...@@ -2,38 +2,35 @@ ...@@ -2,38 +2,35 @@
模型文件可以从Hugging Face 或 Model Scope 下载,由于网络原因,国内用户访问HF 可能会失败,请使用 ModelScope。 模型文件可以从Hugging Face 或 Model Scope 下载,由于网络原因,国内用户访问HF 可能会失败,请使用 ModelScope。
[Hugging Face](#从-Hugging-Face-下载模型) [Hugging Face](#从-Hugging-Face-下载模型)
[ModelScope](#从-ModelScope-下载模型) [ModelScope](#从-ModelScope-下载模型)
## 从 Hugging Face 下载模型 ## 方法一:从 Hugging Face 下载模型
### 1.安装 Git LFS
开始之前,请确保您的系统上已安装 Git 大文件存储 (Git LFS)。使用以下命令进行安装
```bash
git lfs install
```
### 2.从 Hugging Face 下载模型 使用Git LFS 从Hugging Face下载模型文件
请使用以下命令从 Hugging Face 下载 PDF-Extract-Kit 模型:
```bash ```bash
git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit git lfs install # 安装 Git 大文件存储插件 (Git LFS)
git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit # 从 Hugging Face 下载 PDF-Extract-Kit 模型
``` ```
确保在克隆过程中启用了 Git LFS,以便正确下载所有大文件。
## 方法二:从 ModelScope 下载模型
ModelScope 支持SDK或模型下载,任选一个即可。
## 从 ModelScope 下载模型 [Git LFS下载](#git下载)
ModelScope 支持SDK或模型下载
[SDK下载](#sdk下载) [SDK下载](#sdk下载)
[Git下载](#git下载) ### 1)利用Git LFS下载
```bash
git lfs install
git lfs clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
```
### SDK下载 ### 2)利用SDK下载
```bash ```bash
# 首先安装modelscope # 首先安装modelscope
...@@ -46,35 +43,18 @@ from modelscope import snapshot_download ...@@ -46,35 +43,18 @@ from modelscope import snapshot_download
model_dir = snapshot_download('wanderkid/PDF-Extract-Kit') model_dir = snapshot_download('wanderkid/PDF-Extract-Kit')
``` ```
### Git下载
也可以使用git clone从 ModelScope 下载模型:
#### 1.安装 Git LFS
开始之前,请确保您的系统上已安装 Git 大文件存储 (Git LFS)。使用以下命令进行安装
```bash
git lfs install
```
#### 2.然后通过git lfs下载模型
```bash
git lfs clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
```
## 额外步骤 ## 额外步骤
### 1.检查模型目录是否下载完整 ### 1.检查模型目录是否下载完整
模型文件夹的结构如下,包含了不同组件的配置文件和权重文件: 模型文件夹的结构如下,包含了不同组件的配置文件和权重文件:
``` ```
./ ./
├── Layout ├── Layout # 布局检测模型
│ ├── config.json │ ├── config.json
│ └── model_final.pth │ └── model_final.pth
├── MFD ├── MFD # 公式检测
│ └── weights.pt │ └── weights.pt
├── MFR ├── MFR # 公式识别模型
│ └── UniMERNet │ └── UniMERNet
│ ├── config.json │ ├── config.json
│ ├── preprocessor_config.json │ ├── preprocessor_config.json
...@@ -82,7 +62,7 @@ git lfs clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git ...@@ -82,7 +62,7 @@ git lfs clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
│ ├── README.md │ ├── README.md
│ ├── tokenizer_config.json │ ├── tokenizer_config.json
│ └── tokenizer.json │ └── tokenizer.json
│── TabRec │── TabRec # 表格识别模型
│ └─StructEqTable │ └─StructEqTable
│ ├── config.json │ ├── config.json
│ ├── generation_config.json │ ├── generation_config.json
...@@ -100,3 +80,4 @@ git lfs clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git ...@@ -100,3 +80,4 @@ git lfs clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
### 3.移动模型到固态硬盘 ### 3.移动模型到固态硬盘
将 'models' 目录移动到具有较大磁盘空间的目录中,最好是在固态硬盘(SSD)上。 将 'models' 目录移动到具有较大磁盘空间的目录中,最好是在固态硬盘(SSD)上。
此外在 `~/magic-pdf.json`里修改模型的目录指向最终的模型存放位置,否则会报模型无法加载的错误。
import os
import json as json_parse
import click
from loguru import logger
from pathlib import Path
from magic_pdf.libs.version import __version__
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.libs.path_utils import (
parse_s3path,
parse_s3_range_params,
remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import (
get_local_dir,
get_s3_config,
)
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
import csv
import copy
import magic_pdf.model as model_config
# Shared click type for the --method option of the sub-commands in this file:
# restricts the value to one of "ocr", "txt" or "auto".
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def prepare_env(pdf_file_name, method):
    """Create the output directories for one document and return them.

    The markdown directory is
    ``<get_local_dir()>/magic-pdf/<pdf_file_name>/<method>`` and the image
    directory is its ``images`` sub-directory. Both are created when missing.

    Args:
        pdf_file_name: Name of the document, used as a directory name.
        method: Parse method name, used as a directory name.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple.
    """
    local_md_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    local_image_dir = os.path.join(str(local_md_dir), "images")
    for directory in (local_image_dir, local_md_dir):
        os.makedirs(directory, exist_ok=True)
    return local_image_dir, local_md_dir
def write_to_csv(csv_file_path, csv_data):
    """Append a single row to a CSV file and log that it was written.

    Args:
        csv_file_path: Path of the CSV file; opened in append mode as UTF-8.
        csv_data: The row to write, passed as-is to ``csv.writer.writerow``.
    """
    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as handle:
        # newline="" leaves line-ending handling to the csv module.
        csv.writer(handle).writerow(csv_data)
        logger.info(f"数据已成功追加到 '{csv_file_path}'")
def do_parse(
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
):
    """Run the parsing pipeline for one PDF and write the requested artifacts.

    Args:
        pdf_file_name: Base name used for the output directory and file names.
        pdf_bytes: Content of the PDF, handed to the pipe and optionally
            written back out as ``<name>_origin.pdf``.
        model_list: Pre-computed model output. When it is empty the built-in
            model is run instead, provided
            ``model_config.__use_inside_model__`` is truthy.
        parse_method: One of ``"auto"``, ``"txt"`` or ``"ocr"``; selects the
            pipe class. Any other value logs an error and exits with status 1.
        f_draw_span_bbox: Call ``draw_span_bbox`` on the parse result.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` on the parse result.
        f_dump_md: Write ``<name>.md``.
        f_dump_middle_json: Write ``<name>_middle.json``.
        f_dump_model_json: Write ``<name>_model.json``.
        f_dump_orig_pdf: Write ``<name>_origin.pdf``.
        f_dump_content_list: Write ``<name>_content_list.json``.
        f_make_md_mode: Forwarded to ``pipe_mk_markdown`` as ``md_make_mode``.
    """
    # Snapshot the model data before the pipe gets a chance to touch it.
    orig_model_list = copy.deepcopy(model_list)

    local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
    image_writer = DiskReaderWriter(local_image_dir)
    md_writer = DiskReaderWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))

    def to_json(obj):
        # Serialise with non-ASCII characters kept readable.
        return json_parse.dumps(obj, ensure_ascii=False, indent=4)

    def dump(content, suffix, mode):
        # Write one artifact named "<pdf_file_name><suffix>" into the md dir.
        md_writer.write(content=content, path=f"{pdf_file_name}{suffix}", mode=mode)

    if parse_method == "auto":
        pipe = UNIPipe(
            pdf_bytes,
            {"_pdf_type": "", "model_list": model_list},
            image_writer,
            is_debug=True,
        )
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        logger.error("unknown parse method")
        exit(1)

    pipe.pipe_classify()

    # If no valid model data was passed in, fall back to the built-in model.
    if len(model_list) == 0:
        if not model_config.__use_inside_model__:
            logger.error("need model list input")
            exit(1)
        else:
            pipe.pipe_analyze()
            orig_model_list = copy.deepcopy(pipe.model_list)

    pipe.pipe_parse()

    pdf_info = pipe.pdf_mid_data["pdf_info"]
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    md_content = pipe.pipe_mk_markdown(
        image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
    )

    if f_dump_md:
        # Markdown output.
        dump(md_content, ".md", AbsReaderWriter.MODE_TXT)

    if f_dump_middle_json:
        # Intermediate parse result.
        dump(to_json(pipe.pdf_mid_data), "_middle.json", AbsReaderWriter.MODE_TXT)

    if f_dump_model_json:
        # Model output (the input snapshot, or the built-in model's result).
        dump(to_json(orig_model_list), "_model.json", AbsReaderWriter.MODE_TXT)

    if f_dump_orig_pdf:
        # Copy of the source PDF.
        dump(pdf_bytes, "_origin.pdf", AbsReaderWriter.MODE_BIN)

    # Built unconditionally, exactly like the markdown above.
    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        # Unified-format content list.
        dump(to_json(content_list), "_content_list.json", AbsReaderWriter.MODE_TXT)

    logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
# Root click command group. Sub-commands attach themselves to it with
# @cli.command(); the group callback itself does nothing.
# NOTE: documented with comments on purpose -- click would display a docstring
# placed on this function as part of the --help output.
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    pass
# Sub-command: parse one PDF described by a JSON record stored on S3.
#
# --json must be an "s3://" path to the record. The record names the PDF via
# its "file_location" (or, failing that, "path") key and carries the model
# output under "doc_layout_result".
#
# NOTE: documented with comments rather than a docstring, because click would
# display a docstring on the callback as the command's help text.
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def json_command(json, method, inside_model, model_mode):
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    # --json is not a required option, so it is None when omitted; treat that
    # the same as a non-S3 value instead of failing on None.startswith.
    if not json or not json.startswith("s3://"):
        logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
        exit(1)

    def read_s3_path(s3path):
        """Read an S3 object, honouring an optional byte range in the path.

        When ``parse_s3_range_params`` yields exactly two values they are used
        as ``(start, length)``: the end offset passed on is
        ``start + length - 1``. Otherwise the object is read from offset 0
        with no end offset.
        """
        bucket, _ = parse_s3path(s3path)
        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))

    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        # Without this check Path(None) below fails with an opaque TypeError.
        logger.error("input json has neither 'file_location' nor 'path'")
        exit(1)

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
    )
# Sub-command: parse every PDF listed in a local JSONL file.
#
# Each non-blank line of --local_json is a JSON record that names a PDF on S3
# via its "file_location" (or, failing that, "path") key and carries the model
# output under "doc_layout_result".
#
# NOTE: documented with comments rather than a docstring, because click would
# display a docstring on the callback as the command's help text.
@cli.command()
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def local_json_command(local_json, method, inside_model, model_mode):
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    # --local_json is not a required option, so it is None when omitted;
    # report that clearly instead of failing inside open().
    if not local_json:
        logger.error("usage: magic-pdf local-json-command --local_json some_path.jsonl")
        exit(1)

    def read_s3_path(s3path):
        """Read an S3 object, honouring an optional byte range in the path.

        When ``parse_s3_range_params`` yields exactly two values they are used
        as ``(start, length)``: the end offset passed on is
        ``start + length - 1``. Otherwise the object is read from offset 0
        with no end offset.
        """
        bucket, _ = parse_s3path(s3path)
        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    with open(local_json, "r", encoding="utf-8") as f:
        for json_line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a record; skip it rather than abort on a decode error.
            if not json_line.strip():
                continue
            jso = json_parse.loads(json_line)

            s3_file_path = jso.get("file_location")
            if s3_file_path is None:
                s3_file_path = jso.get("path")
            if s3_file_path is None:
                # Without this check Path(None) fails with an opaque TypeError
                # and the remaining records are never processed.
                logger.error("record has neither 'file_location' nor 'path', skipped")
                continue

            pdf_file_name = Path(s3_file_path).stem
            pdf_data = read_s3_path(s3_file_path)
            do_parse(
                pdf_file_name,
                pdf_data,
                jso["doc_layout_result"],
                method,
            )
# Sub-command: parse a local PDF, or every PDF named in a ".list" file.
#
# --pdf is either a single PDF path or a file ending in ".list" that holds one
# PDF path per line. --model optionally points at a model-output JSON file;
# when omitted, "<pdf path without extension>.json" is used if it exists,
# otherwise an empty model list is passed on so that do_parse falls back to
# the built-in model.
#
# NOTE: documented with comments rather than a docstring, because click would
# display a docstring on the callback as the command's help text.
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True,
    help='pdf 文件路径, 支持单个文件或文件列表, 文件列表需要以".list"结尾, 一行一个pdf文件路径')
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def pdf_command(pdf, model, method, inside_model, model_mode):
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    def read_fn(path):
        """Read the local file at ``path`` in binary mode."""
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def get_model_json(model_path, doc_path):
        """Return the model-output JSON text to use for ``doc_path``.

        Raises:
            Exception: ``model_path`` is None and ``doc_path`` does not end
                in ``.pdf``.
        """
        if model_path is None:
            file_name_without_extension, extension = os.path.splitext(doc_path)
            if extension != ".pdf":
                raise Exception("pdf_path input error")
            model_path = file_name_without_extension + ".json"
            if not os.path.exists(model_path):
                logger.warning(
                    f"not found json {model_path} existed"
                )
                # No local model data: hand over an empty list so the built-in
                # model is invoked further down the line.
                return "[]"
        return read_fn(model_path).decode("utf-8")

    def parse_doc(doc_path):
        """Parse one document; failures are logged, not propagated."""
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            jso = json_parse.loads(get_model_json(model, doc_path))

            do_parse(
                file_name,
                pdf_data,
                jso,
                method,
            )
        except Exception as e:
            # Best effort per document so one bad file does not stop a batch.
            logger.exception(e)

    if not pdf:
        # click already enforces required=True; this only guards direct calls
        # of the callback.
        logger.error("Error: Missing argument '--pdf'.")
        exit("Error: Missing argument '--pdf'.")
    elif pdf.endswith(".list"):
        # List file: one document path per line. Read as UTF-8, like the other
        # text files this module opens, instead of the platform default.
        with open(pdf, "r", encoding="utf-8") as f:
            for line in f:
                doc_path = line.strip()
                # Blank lines are not documents; skip them instead of logging
                # a spurious failure for an empty path.
                if doc_path:
                    parse_doc(doc_path)
    else:
        # Single document.
        parse_doc(pdf)
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment