Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
6f58eeab
Commit
6f58eeab
authored
Aug 28, 2024
by
drunkpig
Browse files
Options
Browse Files
Download
Plain Diff
merge: sync from master branch
parents
9067cd31
7f0fe200
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
362 additions
and
237 deletions
+362
-237
Dockerfile
Dockerfile
+1
-1
README_zh-CN.md.bak
README_zh-CN.md.bak
+287
-185
download_models.py
docs/download_models.py
+4
-0
how_to_download_models_en.md
docs/how_to_download_models_en.md
+1
-1
how_to_download_models_zh_cn.md
docs/how_to_download_models_zh_cn.md
+3
-3
para_split_v2.py
magic_pdf/para/para_split_v2.py
+50
-47
cla.json
signatures/version1/cla.json
+16
-0
No files found.
Dockerfile
View file @
6f58eeab
...
...
@@ -48,4 +48,4 @@ RUN /bin/bash -c "pip3 install modelscope && \
sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT
["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
ENTRYPOINT
["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
\ No newline at end of file
README_zh-CN.md.bak
View file @
6f58eeab
This diff is collapsed.
Click to expand it.
docs/download_models.py
0 → 100644
View file @
6f58eeab
# use modelscope sdk download models
from
modelscope
import
snapshot_download
model_dir
=
snapshot_download
(
'opendatalab/PDF-Extract-Kit'
)
print
(
f
"model dir is: {model_dir}/models"
)
docs/how_to_download_models_en.md
View file @
6f58eeab
...
...
@@ -9,7 +9,7 @@ git lfs install
To download the
`PDF-Extract-Kit`
model from Hugging Face, use the following command:
```
bash
git lfs clone https://huggingface.co/
wanderkid
/PDF-Extract-Kit
git lfs clone https://huggingface.co/
opendatalab
/PDF-Extract-Kit
```
Ensure that Git LFS is enabled during the clone to properly download all large files.
...
...
docs/how_to_download_models_zh_cn.md
View file @
6f58eeab
...
...
@@ -13,7 +13,7 @@
```
bash
git lfs
install
# 安装 Git 大文件存储插件 (Git LFS)
git lfs clone https://huggingface.co/
wanderkid
/PDF-Extract-Kit
# 从 Hugging Face 下载 PDF-Extract-Kit 模型
git lfs clone https://huggingface.co/
opendatalab
/PDF-Extract-Kit
# 从 Hugging Face 下载 PDF-Extract-Kit 模型
```
...
...
@@ -28,7 +28,7 @@ ModelScope 支持SDK或模型下载,任选一个即可。
```
bash
git lfs
install
git lfs clone https://www.modelscope.cn/
wanderkid
/PDF-Extract-Kit.git
git lfs clone https://www.modelscope.cn/
opendatalab
/PDF-Extract-Kit.git
```
### 2)利用SDK下载
...
...
@@ -41,7 +41,7 @@ pip install modelscope
```
python
# 使用modelscope sdk下载模型
from
modelscope
import
snapshot_download
model_dir
=
snapshot_download
(
'
wanderkid
/PDF-Extract-Kit'
)
model_dir
=
snapshot_download
(
'
opendatalab
/PDF-Extract-Kit'
)
print
(
f
"模型文件下载路径为:{model_dir}/models"
)
```
...
...
magic_pdf/para/para_split_v2.py
View file @
6f58eeab
...
...
@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
if
lang
!=
'en'
:
return
lines
,
None
else
:
total_lines
=
len
(
lines
)
line_fea_encode
=
[]
"""
对每一行进行特征编码,编码规则如下:
1. 如果行顶格,且大写字母开头或者数字开头,编码为1
2. 如果顶格,其他非大写开头编码为4
3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3
"""
if
len
(
lines
)
>
0
:
x_map_tag_dict
,
min_x_tag
=
cluster_line_x
(
lines
)
for
l
in
lines
:
span_text
=
__get_span_text
(
l
[
'spans'
][
0
])
first_char
=
span_text
[
0
]
layout
=
__find_layout_bbox_by_line
(
l
[
'bbox'
],
new_layout_bboxes
)
if
not
layout
:
line_fea_encode
.
append
(
0
)
total_lines
=
len
(
lines
)
line_fea_encode
=
[]
"""
对每一行进行特征编码,编码规则如下:
1. 如果行顶格,且大写字母开头或者数字开头,编码为1
2. 如果顶格,其他非大写开头编码为4
3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3
"""
if
len
(
lines
)
>
0
:
x_map_tag_dict
,
min_x_tag
=
cluster_line_x
(
lines
)
for
l
in
lines
:
span_text
=
__get_span_text
(
l
[
'spans'
][
0
])
if
not
span_text
:
line_fea_encode
.
append
(
0
)
continue
first_char
=
span_text
[
0
]
layout
=
__find_layout_bbox_by_line
(
l
[
'bbox'
],
new_layout_bboxes
)
if
not
layout
:
line_fea_encode
.
append
(
0
)
else
:
#
if
x_map_tag_dict
[
round
(
l
[
'bbox'
][
0
])]
==
min_x_tag
:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if
not
first_char
.
isalnum
()
or
if_match_reference_list
(
span_text
):
line_fea_encode
.
append
(
1
)
else
:
line_fea_encode
.
append
(
4
)
else
:
#
if
x_map_tag_dict
[
round
(
l
[
'bbox'
][
0
])]
==
min_x_tag
:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if
not
first_char
.
isalnum
()
or
if_match_reference_list
(
span_text
):
line_fea_encode
.
append
(
1
)
else
:
line_fea_encode
.
append
(
4
)
if
first_char
.
isupper
():
line_fea_encode
.
append
(
2
)
else
:
if
first_char
.
isupper
():
line_fea_encode
.
append
(
2
)
else
:
line_fea_encode
.
append
(
3
)
line_fea_encode
.
append
(
3
)
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice
,
list_start_idx
=
find_repeating_patterns2
(
line_fea_encode
)
if
len
(
list_indice
)
>
0
:
list_indice
,
list_start_idx
=
find_repeating_patterns2
(
line_fea_encode
)
if
len
(
list_indice
)
>
0
:
if
debug_able
:
logger
.
info
(
f
"发现了列表,列表行数:{list_indice}, {list_start_idx}"
)
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments
=
[]
for
start
,
end
in
list_indice
:
for
i
in
range
(
start
,
end
+
1
):
if
i
>
0
:
if
line_fea_encode
[
i
]
==
4
:
if
debug_able
:
logger
.
info
(
f
"列表行的第{i}行不是顶格的"
)
break
else
:
if
debug_able
:
logger
.
info
(
f
"发现了列表,列表行数:{list_indice}, {list_start_idx}"
)
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments
=
[]
for
start
,
end
in
list_indice
:
for
i
in
range
(
start
,
end
+
1
):
if
i
>
0
:
if
line_fea_encode
[
i
]
==
4
:
if
debug_able
:
logger
.
info
(
f
"列表行的第{i}行不是顶格的"
)
break
else
:
if
debug_able
:
logger
.
info
(
f
"列表行的第{start}到第{end}行是列表"
)
logger
.
info
(
f
"列表行的第{start}到第{end}行是列表"
)
return
split_indices
(
total_lines
,
list_indice
),
list_start_idx
return
split_indices
(
total_lines
,
list_indice
),
list_start_idx
def
cluster_line_x
(
lines
:
list
)
->
dict
:
...
...
signatures/version1/cla.json
View file @
6f58eeab
...
...
@@ -31,6 +31,22 @@
"created_at"
:
"2024-08-13T12:23:16Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
418
},
{
"name"
:
"Matthijz98"
,
"id"
:
17087153
,
"comment_id"
:
2298912989
,
"created_at"
:
"2024-08-20T13:49:50Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
467
},
{
"name"
:
"strongerfly"
,
"id"
:
11643869
,
"comment_id"
:
2309481561
,
"created_at"
:
"2024-08-26T07:01:49Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
487
}
]
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment