Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
ca737d5c
Commit
ca737d5c
authored
Jul 24, 2024
by
myhloli
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/master'
parents
f8599d2b
3fc9943f
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
88 additions
and
44 deletions
+88
-44
README_zh-CN.md
README_zh-CN.md
+5
-2
FAQ_zh_cn.md
docs/FAQ_zh_cn.md
+17
-1
magicpdf.py
magic_pdf/cli/magicpdf.py
+1
-1
config_reader.py
magic_pdf/libs/config_reader.py
+24
-6
pdf_extract_kit.py
magic_pdf/model/pdf_extract_kit.py
+41
-34
No files found.
README_zh-CN.md
View file @
ca737d5c
...
...
@@ -121,11 +121,14 @@ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
下载后请将models目录移动到空间较大的ssd磁盘目录
#### 3. 拷贝配置文件并进行配置
在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件
在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 配置模版文件
> ❗️务必执行以下命令将配置文件拷贝到用户目录下,否则程序将无法运行
```bash
cp magic-pdf.template.json ~/magic-pdf.json
```
在magic-pdf.json中配置"models-dir"为模型权重文件所在目录
在用户目录中找到magic-pdf.json文件并配置"models-dir"为 [2. 下载模型权重文件](#2-下载模型权重文件) 中下载的模型权重文件所在目录
> ❗️务必正确配置模型权重文件所在目录,否则会因为找不到模型文件而导致程序无法运行
```
json
{
"models-dir"
:
"/tmp/models"
...
...
docs/FAQ_zh_cn.md
View file @
ca737d5c
...
...
@@ -22,6 +22,7 @@ pip install magic-pdf[full-cpu]
pip install magic-pdf
pip install unimernet==0.1.0
pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle
pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
```
### 4.在部分较新的M芯片macOS设备上,MPS加速开启失败
...
...
@@ -82,4 +83,19 @@ pip install paddlepaddle-gpu
model_json 指的是通过模型分析后生成的一种有特定格式的json文件。
如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成,该文件一般在项目的output目录下。
如果使用 MinerU 的命令行调用内置的模型分析,该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。
参考:https://github.com/opendatalab/MinerU/issues/128
\ No newline at end of file
参考:https://github.com/opendatalab/MinerU/issues/128
### 10.Linux下报错:Required dependency not installed, please install by "pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"
这种情况可以先使用pip list 检查一下自己的依赖库列表,重点确认下以下几个库有没有安装(版本不一定完全一致,有就可以)
```bash
opencv-contrib-python 4.6.0.66
opencv-python 4.6.0.66
opencv-python-headless 4.10.0.84
paddleocr 2.7.3
paddlepaddle 2.6.1
torch 2.2.2
torchtext 0.17.2
torchvision 0.17.2
```
如果都有的话,可能是libgl库没有安装,参考 https://github.com/opendatalab/MinerU/issues/165#issuecomment-2245202282 安装libgl库后再试试能不能正常使用。
magic_pdf/cli/magicpdf.py
View file @
ca737d5c
...
...
@@ -89,7 +89,6 @@ def do_parse(
orig_model_list
=
copy
.
deepcopy
(
model_list
)
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
,
parse_method
)
logger
.
info
(
f
"local output dir is {local_md_dir}"
)
image_writer
,
md_writer
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
...
...
@@ -163,6 +162,7 @@ def do_parse(
path
=
f
"{pdf_file_name}_content_list.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
logger
.
info
(
f
"local output dir is '{local_md_dir}', you can found the result in it."
)
@
click
.
group
()
...
...
magic_pdf/libs/config_reader.py
View file @
ca737d5c
...
...
@@ -10,14 +10,17 @@ from loguru import logger
from
magic_pdf.libs.commons
import
parse_bucket_key
# 定义配置文件名常量
CONFIG_FILE_NAME
=
"magic-pdf.json"
def
read_config
():
home_dir
=
os
.
path
.
expanduser
(
"~"
)
config_file
=
os
.
path
.
join
(
home_dir
,
"magic-pdf.json"
)
config_file
=
os
.
path
.
join
(
home_dir
,
CONFIG_FILE_NAME
)
if
not
os
.
path
.
exists
(
config_file
):
raise
Exception
(
f
"{config_file} not found"
)
raise
FileNotFoundError
(
f
"{config_file} not found"
)
with
open
(
config_file
,
"r"
)
as
f
:
config
=
json
.
load
(
f
)
...
...
@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str):
access_key
,
secret_key
,
storage_endpoint
=
bucket_info
[
bucket_name
]
if
access_key
is
None
or
secret_key
is
None
or
storage_endpoint
is
None
:
raise
Exception
(
"ak, sk or endpoint not found in magic-pdf.json
"
)
raise
Exception
(
f
"ak, sk or endpoint not found in {CONFIG_FILE_NAME}
"
)
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
...
...
@@ -56,17 +59,32 @@ def get_bucket_name(path):
def
get_local_dir
():
config
=
read_config
()
return
config
.
get
(
"temp-output-dir"
,
"/tmp"
)
local_dir
=
config
.
get
(
"temp-output-dir"
)
if
local_dir
is
None
:
logger
.
warning
(
f
"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default"
)
return
"/tmp"
else
:
return
local_dir
def
get_local_models_dir
():
config
=
read_config
()
return
config
.
get
(
"models-dir"
,
"/tmp/models"
)
models_dir
=
config
.
get
(
"models-dir"
)
if
models_dir
is
None
:
logger
.
warning
(
f
"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default"
)
return
"/tmp/models"
else
:
return
models_dir
def
get_device
():
config
=
read_config
()
return
config
.
get
(
"device-mode"
,
"cpu"
)
device
=
config
.
get
(
"device-mode"
)
if
device
is
None
:
logger
.
warning
(
f
"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default"
)
return
"cpu"
else
:
return
device
if
__name__
==
"__main__"
:
...
...
magic_pdf/model/pdf_extract_kit.py
View file @
ca737d5c
from
loguru
import
logger
import
os
import
time
os
.
environ
[
'NO_ALBUMENTATIONS_UPDATE'
]
=
'1'
# 禁止albumentations检查更新
try
:
import
cv2
import
yaml
...
...
@@ -17,14 +19,17 @@ try:
import
unimernet.tasks
as
tasks
from
unimernet.processors
import
load_processor
from
magic_pdf.model.pek_sub_modules.layoutlmv3.model_init
import
Layoutlmv3_Predictor
from
magic_pdf.model.pek_sub_modules.post_process
import
get_croped_image
,
latex_rm_whitespace
from
magic_pdf.model.pek_sub_modules.self_modify
import
ModifiedPaddleOCR
except
ImportError
as
e
:
logger
.
exception
(
e
)
logger
.
error
(
'Required dependency not installed, please install by
\n
"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"'
)
logger
.
error
(
'Required dependency not installed, please install by
\n
'
'"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"'
)
exit
(
1
)
from
magic_pdf.model.pek_sub_modules.layoutlmv3.model_init
import
Layoutlmv3_Predictor
from
magic_pdf.model.pek_sub_modules.post_process
import
get_croped_image
,
latex_rm_whitespace
from
magic_pdf.model.pek_sub_modules.self_modify
import
ModifiedPaddleOCR
def
mfd_model_init
(
weight
):
mfd_model
=
YOLO
(
weight
)
...
...
@@ -100,6 +105,7 @@ class CustomPEKModel:
self
.
device
=
kwargs
.
get
(
"device"
,
self
.
configs
[
"config"
][
"device"
])
logger
.
info
(
"using device: {}"
.
format
(
self
.
device
))
models_dir
=
kwargs
.
get
(
"models_dir"
,
os
.
path
.
join
(
root_dir
,
"resources"
,
"models"
))
logger
.
info
(
"using models_dir: {}"
.
format
(
models_dir
))
# 初始化公式识别
if
self
.
apply_formula
:
...
...
@@ -135,34 +141,35 @@ class CustomPEKModel:
layout_cost
=
round
(
time
.
time
()
-
layout_start
,
2
)
logger
.
info
(
f
"layout detection cost: {layout_cost}"
)
# 公式检测
mfd_res
=
self
.
mfd_model
.
predict
(
image
,
imgsz
=
1888
,
conf
=
0.25
,
iou
=
0.45
,
verbose
=
True
)[
0
]
for
xyxy
,
conf
,
cla
in
zip
(
mfd_res
.
boxes
.
xyxy
.
cpu
(),
mfd_res
.
boxes
.
conf
.
cpu
(),
mfd_res
.
boxes
.
cls
.
cpu
()):
xmin
,
ymin
,
xmax
,
ymax
=
[
int
(
p
.
item
())
for
p
in
xyxy
]
new_item
=
{
'category_id'
:
13
+
int
(
cla
.
item
()),
'poly'
:
[
xmin
,
ymin
,
xmax
,
ymin
,
xmax
,
ymax
,
xmin
,
ymax
],
'score'
:
round
(
float
(
conf
.
item
()),
2
),
'latex'
:
''
,
}
layout_res
.
append
(
new_item
)
latex_filling_list
.
append
(
new_item
)
bbox_img
=
get_croped_image
(
Image
.
fromarray
(
image
),
[
xmin
,
ymin
,
xmax
,
ymax
])
mf_image_list
.
append
(
bbox_img
)
# 公式识别
mfr_start
=
time
.
time
()
dataset
=
MathDataset
(
mf_image_list
,
transform
=
self
.
mfr_transform
)
dataloader
=
DataLoader
(
dataset
,
batch_size
=
64
,
num_workers
=
0
)
mfr_res
=
[]
for
mf_img
in
dataloader
:
mf_img
=
mf_img
.
to
(
self
.
device
)
output
=
self
.
mfr_model
.
generate
({
'image'
:
mf_img
})
mfr_res
.
extend
(
output
[
'pred_str'
])
for
res
,
latex
in
zip
(
latex_filling_list
,
mfr_res
):
res
[
'latex'
]
=
latex_rm_whitespace
(
latex
)
mfr_cost
=
round
(
time
.
time
()
-
mfr_start
,
2
)
logger
.
info
(
f
"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}"
)
if
self
.
apply_formula
:
# 公式检测
mfd_res
=
self
.
mfd_model
.
predict
(
image
,
imgsz
=
1888
,
conf
=
0.25
,
iou
=
0.45
,
verbose
=
True
)[
0
]
for
xyxy
,
conf
,
cla
in
zip
(
mfd_res
.
boxes
.
xyxy
.
cpu
(),
mfd_res
.
boxes
.
conf
.
cpu
(),
mfd_res
.
boxes
.
cls
.
cpu
()):
xmin
,
ymin
,
xmax
,
ymax
=
[
int
(
p
.
item
())
for
p
in
xyxy
]
new_item
=
{
'category_id'
:
13
+
int
(
cla
.
item
()),
'poly'
:
[
xmin
,
ymin
,
xmax
,
ymin
,
xmax
,
ymax
,
xmin
,
ymax
],
'score'
:
round
(
float
(
conf
.
item
()),
2
),
'latex'
:
''
,
}
layout_res
.
append
(
new_item
)
latex_filling_list
.
append
(
new_item
)
bbox_img
=
get_croped_image
(
Image
.
fromarray
(
image
),
[
xmin
,
ymin
,
xmax
,
ymax
])
mf_image_list
.
append
(
bbox_img
)
# 公式识别
mfr_start
=
time
.
time
()
dataset
=
MathDataset
(
mf_image_list
,
transform
=
self
.
mfr_transform
)
dataloader
=
DataLoader
(
dataset
,
batch_size
=
64
,
num_workers
=
0
)
mfr_res
=
[]
for
mf_img
in
dataloader
:
mf_img
=
mf_img
.
to
(
self
.
device
)
output
=
self
.
mfr_model
.
generate
({
'image'
:
mf_img
})
mfr_res
.
extend
(
output
[
'pred_str'
])
for
res
,
latex
in
zip
(
latex_filling_list
,
mfr_res
):
res
[
'latex'
]
=
latex_rm_whitespace
(
latex
)
mfr_cost
=
round
(
time
.
time
()
-
mfr_start
,
2
)
logger
.
info
(
f
"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}"
)
# ocr识别
if
self
.
apply_ocr
:
...
...
@@ -189,8 +196,8 @@ class CustomPEKModel:
paste_x
=
50
paste_y
=
50
# 创建一个宽高各多50的白色背景
new_width
=
xmax
-
xmin
+
paste_x
*
2
new_height
=
ymax
-
ymin
+
paste_y
*
2
new_width
=
xmax
-
xmin
+
paste_x
*
2
new_height
=
ymax
-
ymin
+
paste_y
*
2
new_image
=
Image
.
new
(
'RGB'
,
(
new_width
,
new_height
),
'white'
)
# 裁剪图像
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment