Commit 6f58eeab authored by drunkpig's avatar drunkpig

merge: sync from master branch

parents 9067cd31 7f0fe200
......@@ -48,4 +48,4 @@ RUN /bin/bash -c "pip3 install modelscope && \
sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
\ No newline at end of file
This diff is collapsed.
# use modelscope sdk download models
from modelscope import snapshot_download
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
print(f"model dir is: {model_dir}/models")
......@@ -9,7 +9,7 @@ git lfs install
To download the `PDF-Extract-Kit` model from Hugging Face, use the following command:
```bash
git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit
git lfs clone https://huggingface.co/opendatalab/PDF-Extract-Kit
```
Ensure that Git LFS is enabled during the clone to properly download all large files.
......
......@@ -13,7 +13,7 @@
```bash
git lfs install # 安装 Git 大文件存储插件 (Git LFS)
git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit # 从 Hugging Face 下载 PDF-Extract-Kit 模型
git lfs clone https://huggingface.co/opendatalab/PDF-Extract-Kit # 从 Hugging Face 下载 PDF-Extract-Kit 模型
```
......@@ -28,7 +28,7 @@ ModelScope 支持SDK或模型下载,任选一个即可。
```bash
git lfs install
git lfs clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
git lfs clone https://www.modelscope.cn/opendatalab/PDF-Extract-Kit.git
```
### 2)利用SDK下载
......@@ -41,7 +41,7 @@ pip install modelscope
```python
# 使用modelscope sdk下载模型
from modelscope import snapshot_download
model_dir = snapshot_download('wanderkid/PDF-Extract-Kit')
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
print(f"模型文件下载路径为:{model_dir}/models")
```
......
......@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
if lang != 'en':
return lines, None
else:
total_lines = len(lines)
line_fea_encode = []
"""
对每一行进行特征编码,编码规则如下:
1. 如果行顶格,且大写字母开头或者数字开头,编码为1
2. 如果顶格,其他非大写开头编码为4
3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3
"""
if len(lines) > 0:
x_map_tag_dict, min_x_tag = cluster_line_x(lines)
for l in lines:
span_text = __get_span_text(l['spans'][0])
first_char = span_text[0]
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
if not layout:
line_fea_encode.append(0)
total_lines = len(lines)
line_fea_encode = []
"""
对每一行进行特征编码,编码规则如下:
1. 如果行顶格,且大写字母开头或者数字开头,编码为1
2. 如果顶格,其他非大写开头编码为4
3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3
"""
if len(lines) > 0:
x_map_tag_dict, min_x_tag = cluster_line_x(lines)
for l in lines:
span_text = __get_span_text(l['spans'][0])
if not span_text:
line_fea_encode.append(0)
continue
first_char = span_text[0]
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
if not layout:
line_fea_encode.append(0)
else:
#
if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if not first_char.isalnum() or if_match_reference_list(span_text):
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
else:
#
if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if not first_char.isalnum() or if_match_reference_list(span_text):
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
if first_char.isupper():
line_fea_encode.append(2)
else:
if first_char.isupper():
line_fea_encode.append(2)
else:
line_fea_encode.append(3)
line_fea_encode.append(3)
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
if len(list_indice) > 0:
list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
if len(list_indice) > 0:
if debug_able:
logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments = []
for start, end in list_indice:
for i in range(start, end + 1):
if i > 0:
if line_fea_encode[i] == 4:
if debug_able:
logger.info(f"列表行的第{i}行不是顶格的")
break
else:
if debug_able:
logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments = []
for start, end in list_indice:
for i in range(start, end + 1):
if i > 0:
if line_fea_encode[i] == 4:
if debug_able:
logger.info(f"列表行的第{i}行不是顶格的")
break
else:
if debug_able:
logger.info(f"列表行的第{start}到第{end}行是列表")
logger.info(f"列表行的第{start}到第{end}行是列表")
return split_indices(total_lines, list_indice), list_start_idx
return split_indices(total_lines, list_indice), list_start_idx
def cluster_line_x(lines: list) -> dict:
......
......@@ -31,6 +31,22 @@
"created_at": "2024-08-13T12:23:16Z",
"repoId": 765083837,
"pullRequestNo": 418
},
{
"name": "Matthijz98",
"id": 17087153,
"comment_id": 2298912989,
"created_at": "2024-08-20T13:49:50Z",
"repoId": 765083837,
"pullRequestNo": 467
},
{
"name": "strongerfly",
"id": 11643869,
"comment_id": 2309481561,
"created_at": "2024-08-26T07:01:49Z",
"repoId": 765083837,
"pullRequestNo": 487
}
]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment