Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
6f58eeab
Commit
6f58eeab
authored
Aug 28, 2024
by
drunkpig
Browse files
Options
Browse Files
Download
Plain Diff
merge: sync from master branch
parents
9067cd31
7f0fe200
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
362 additions
and
237 deletions
+362
-237
Dockerfile
Dockerfile
+1
-1
README_zh-CN.md.bak
README_zh-CN.md.bak
+287
-185
download_models.py
docs/download_models.py
+4
-0
how_to_download_models_en.md
docs/how_to_download_models_en.md
+1
-1
how_to_download_models_zh_cn.md
docs/how_to_download_models_zh_cn.md
+3
-3
para_split_v2.py
magic_pdf/para/para_split_v2.py
+50
-47
cla.json
signatures/version1/cla.json
+16
-0
No files found.
Dockerfile
View file @
6f58eeab
...
@@ -48,4 +48,4 @@ RUN /bin/bash -c "pip3 install modelscope && \
...
@@ -48,4 +48,4 @@ RUN /bin/bash -c "pip3 install modelscope && \
sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
# Set the entry point to activate the virtual environment and run the command line tool
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT
["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
ENTRYPOINT
["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
\ No newline at end of file
README_zh-CN.md.bak
View file @
6f58eeab
This diff is collapsed.
Click to expand it.
docs/download_models.py
0 → 100644
View file @
6f58eeab
# use modelscope sdk download models
from
modelscope
import
snapshot_download
model_dir
=
snapshot_download
(
'opendatalab/PDF-Extract-Kit'
)
print
(
f
"model dir is: {model_dir}/models"
)
docs/how_to_download_models_en.md
View file @
6f58eeab
...
@@ -9,7 +9,7 @@ git lfs install
...
@@ -9,7 +9,7 @@ git lfs install
To download the
`PDF-Extract-Kit`
model from Hugging Face, use the following command:
To download the
`PDF-Extract-Kit`
model from Hugging Face, use the following command:
```
bash
```
bash
git lfs clone https://huggingface.co/
wanderkid
/PDF-Extract-Kit
git lfs clone https://huggingface.co/
opendatalab
/PDF-Extract-Kit
```
```
Ensure that Git LFS is enabled during the clone to properly download all large files.
Ensure that Git LFS is enabled during the clone to properly download all large files.
...
...
docs/how_to_download_models_zh_cn.md
View file @
6f58eeab
...
@@ -13,7 +13,7 @@
...
@@ -13,7 +13,7 @@
```
bash
```
bash
git lfs
install
# 安装 Git 大文件存储插件 (Git LFS)
git lfs
install
# 安装 Git 大文件存储插件 (Git LFS)
git lfs clone https://huggingface.co/
wanderkid
/PDF-Extract-Kit
# 从 Hugging Face 下载 PDF-Extract-Kit 模型
git lfs clone https://huggingface.co/
opendatalab
/PDF-Extract-Kit
# 从 Hugging Face 下载 PDF-Extract-Kit 模型
```
```
...
@@ -28,7 +28,7 @@ ModelScope 支持SDK或模型下载,任选一个即可。
...
@@ -28,7 +28,7 @@ ModelScope 支持SDK或模型下载,任选一个即可。
```
bash
```
bash
git lfs
install
git lfs
install
git lfs clone https://www.modelscope.cn/
wanderkid
/PDF-Extract-Kit.git
git lfs clone https://www.modelscope.cn/
opendatalab
/PDF-Extract-Kit.git
```
```
### 2)利用SDK下载
### 2)利用SDK下载
...
@@ -41,7 +41,7 @@ pip install modelscope
...
@@ -41,7 +41,7 @@ pip install modelscope
```
python
```
python
# 使用modelscope sdk下载模型
# 使用modelscope sdk下载模型
from
modelscope
import
snapshot_download
from
modelscope
import
snapshot_download
model_dir
=
snapshot_download
(
'
wanderkid
/PDF-Extract-Kit'
)
model_dir
=
snapshot_download
(
'
opendatalab
/PDF-Extract-Kit'
)
print
(
f
"模型文件下载路径为:{model_dir}/models"
)
print
(
f
"模型文件下载路径为:{model_dir}/models"
)
```
```
...
...
magic_pdf/para/para_split_v2.py
View file @
6f58eeab
...
@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
...
@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
if
lang
!=
'en'
:
if
lang
!=
'en'
:
return
lines
,
None
return
lines
,
None
else
:
total_lines
=
len
(
lines
)
total_lines
=
len
(
lines
)
line_fea_encode
=
[]
line_fea_encode
=
[]
"""
"""
对每一行进行特征编码,编码规则如下:
对每一行进行特征编码,编码规则如下:
1. 如果行顶格,且大写字母开头或者数字开头,编码为1
1. 如果行顶格,且大写字母开头或者数字开头,编码为1
2. 如果顶格,其他非大写开头编码为4
2. 如果顶格,其他非大写开头编码为4
3. 如果非顶格,首字符大写,编码为2
3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3
4. 如果非顶格,首字符非大写编码为3
"""
"""
if
len
(
lines
)
>
0
:
if
len
(
lines
)
>
0
:
x_map_tag_dict
,
min_x_tag
=
cluster_line_x
(
lines
)
x_map_tag_dict
,
min_x_tag
=
cluster_line_x
(
lines
)
for
l
in
lines
:
for
l
in
lines
:
span_text
=
__get_span_text
(
l
[
'spans'
][
0
])
span_text
=
__get_span_text
(
l
[
'spans'
][
0
])
first_char
=
span_text
[
0
]
if
not
span_text
:
layout
=
__find_layout_bbox_by_line
(
l
[
'bbox'
],
new_layout_bboxes
)
line_fea_encode
.
append
(
0
)
if
not
layout
:
continue
line_fea_encode
.
append
(
0
)
first_char
=
span_text
[
0
]
layout
=
__find_layout_bbox_by_line
(
l
[
'bbox'
],
new_layout_bboxes
)
if
not
layout
:
line_fea_encode
.
append
(
0
)
else
:
#
if
x_map_tag_dict
[
round
(
l
[
'bbox'
][
0
])]
==
min_x_tag
:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if
not
first_char
.
isalnum
()
or
if_match_reference_list
(
span_text
):
line_fea_encode
.
append
(
1
)
else
:
line_fea_encode
.
append
(
4
)
else
:
else
:
#
if
first_char
.
isupper
():
if
x_map_tag_dict
[
round
(
l
[
'bbox'
][
0
])]
==
min_x_tag
:
line_fea_encode
.
append
(
2
)
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if
not
first_char
.
isalnum
()
or
if_match_reference_list
(
span_text
):
line_fea_encode
.
append
(
1
)
else
:
line_fea_encode
.
append
(
4
)
else
:
else
:
if
first_char
.
isupper
():
line_fea_encode
.
append
(
3
)
line_fea_encode
.
append
(
2
)
else
:
line_fea_encode
.
append
(
3
)
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice
,
list_start_idx
=
find_repeating_patterns2
(
line_fea_encode
)
list_indice
,
list_start_idx
=
find_repeating_patterns2
(
line_fea_encode
)
if
len
(
list_indice
)
>
0
:
if
len
(
list_indice
)
>
0
:
if
debug_able
:
logger
.
info
(
f
"发现了列表,列表行数:{list_indice}, {list_start_idx}"
)
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments
=
[]
for
start
,
end
in
list_indice
:
for
i
in
range
(
start
,
end
+
1
):
if
i
>
0
:
if
line_fea_encode
[
i
]
==
4
:
if
debug_able
:
logger
.
info
(
f
"列表行的第{i}行不是顶格的"
)
break
else
:
if
debug_able
:
if
debug_able
:
logger
.
info
(
f
"发现了列表,列表行数:{list_indice}, {list_start_idx}"
)
logger
.
info
(
f
"列表行的第{start}到第{end}行是列表"
)
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments
=
[]
for
start
,
end
in
list_indice
:
for
i
in
range
(
start
,
end
+
1
):
if
i
>
0
:
if
line_fea_encode
[
i
]
==
4
:
if
debug_able
:
logger
.
info
(
f
"列表行的第{i}行不是顶格的"
)
break
else
:
if
debug_able
:
logger
.
info
(
f
"列表行的第{start}到第{end}行是列表"
)
return
split_indices
(
total_lines
,
list_indice
),
list_start_idx
return
split_indices
(
total_lines
,
list_indice
),
list_start_idx
def
cluster_line_x
(
lines
:
list
)
->
dict
:
def
cluster_line_x
(
lines
:
list
)
->
dict
:
...
...
signatures/version1/cla.json
View file @
6f58eeab
...
@@ -31,6 +31,22 @@
...
@@ -31,6 +31,22 @@
"created_at"
:
"2024-08-13T12:23:16Z"
,
"created_at"
:
"2024-08-13T12:23:16Z"
,
"repoId"
:
765083837
,
"repoId"
:
765083837
,
"pullRequestNo"
:
418
"pullRequestNo"
:
418
},
{
"name"
:
"Matthijz98"
,
"id"
:
17087153
,
"comment_id"
:
2298912989
,
"created_at"
:
"2024-08-20T13:49:50Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
467
},
{
"name"
:
"strongerfly"
,
"id"
:
11643869
,
"comment_id"
:
2309481561
,
"created_at"
:
"2024-08-26T07:01:49Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
487
}
}
]
]
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment