Commit 6f58eeab authored by drunkpig's avatar drunkpig

merge: sync from master branch

parents 9067cd31 7f0fe200
...@@ -48,4 +48,4 @@ RUN /bin/bash -c "pip3 install modelscope && \ ...@@ -48,4 +48,4 @@ RUN /bin/bash -c "pip3 install modelscope && \
sed -i 's|cpu|cuda|g' /root/magic-pdf.json" sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
# Set the entry point to activate the virtual environment and run the command line tool # Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"] ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
\ No newline at end of file
This diff is collapsed.
# Download the PDF-Extract-Kit models using the ModelScope SDK.
from modelscope import snapshot_download
# Fetch the 'opendatalab/PDF-Extract-Kit' repository; the return value is
# used below as a directory path (presumably the local snapshot directory —
# confirm against the ModelScope documentation).
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
# Report the "models" subdirectory under the returned path.
print(f"model dir is: {model_dir}/models")
...@@ -9,7 +9,7 @@ git lfs install ...@@ -9,7 +9,7 @@ git lfs install
To download the `PDF-Extract-Kit` model from Hugging Face, use the following command: To download the `PDF-Extract-Kit` model from Hugging Face, use the following command:
```bash ```bash
git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit git lfs clone https://huggingface.co/opendatalab/PDF-Extract-Kit
``` ```
Ensure that Git LFS is enabled during the clone to properly download all large files. Ensure that Git LFS is enabled during the clone to properly download all large files.
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
```bash ```bash
git lfs install # 安装 Git 大文件存储插件 (Git LFS) git lfs install # 安装 Git 大文件存储插件 (Git LFS)
git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit # 从 Hugging Face 下载 PDF-Extract-Kit 模型 git lfs clone https://huggingface.co/opendatalab/PDF-Extract-Kit # 从 Hugging Face 下载 PDF-Extract-Kit 模型
``` ```
...@@ -28,7 +28,7 @@ ModelScope 支持SDK或模型下载,任选一个即可。 ...@@ -28,7 +28,7 @@ ModelScope 支持SDK或模型下载,任选一个即可。
```bash ```bash
git lfs install git lfs install
git lfs clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git git lfs clone https://www.modelscope.cn/opendatalab/PDF-Extract-Kit.git
``` ```
### 2)利用SDK下载 ### 2)利用SDK下载
...@@ -41,7 +41,7 @@ pip install modelscope ...@@ -41,7 +41,7 @@ pip install modelscope
```python ```python
# 使用modelscope sdk下载模型 # 使用modelscope sdk下载模型
from modelscope import snapshot_download from modelscope import snapshot_download
model_dir = snapshot_download('wanderkid/PDF-Extract-Kit') model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
print(f"模型文件下载路径为:{model_dir}/models") print(f"模型文件下载路径为:{model_dir}/models")
``` ```
......
...@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang): ...@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
if lang != 'en': if lang != 'en':
return lines, None return lines, None
else:
total_lines = len(lines) total_lines = len(lines)
line_fea_encode = [] line_fea_encode = []
""" """
对每一行进行特征编码,编码规则如下: 对每一行进行特征编码,编码规则如下:
1. 如果行顶格,且大写字母开头或者数字开头,编码为1 1. 如果行顶格,且大写字母开头或者数字开头,编码为1
2. 如果顶格,其他非大写开头编码为4 2. 如果顶格,其他非大写开头编码为4
3. 如果非顶格,首字符大写,编码为2 3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3 4. 如果非顶格,首字符非大写编码为3
""" """
if len(lines) > 0: if len(lines) > 0:
x_map_tag_dict, min_x_tag = cluster_line_x(lines) x_map_tag_dict, min_x_tag = cluster_line_x(lines)
for l in lines: for l in lines:
span_text = __get_span_text(l['spans'][0]) span_text = __get_span_text(l['spans'][0])
first_char = span_text[0] if not span_text:
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes) line_fea_encode.append(0)
if not layout: continue
line_fea_encode.append(0) first_char = span_text[0]
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
if not layout:
line_fea_encode.append(0)
else:
#
if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if not first_char.isalnum() or if_match_reference_list(span_text):
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
else: else:
# if first_char.isupper():
if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag: line_fea_encode.append(2)
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if not first_char.isalnum() or if_match_reference_list(span_text):
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
else: else:
if first_char.isupper(): line_fea_encode.append(3)
line_fea_encode.append(2)
else:
line_fea_encode.append(3)
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。 # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode) list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
if len(list_indice) > 0: if len(list_indice) > 0:
if debug_able:
logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments = []
for start, end in list_indice:
for i in range(start, end + 1):
if i > 0:
if line_fea_encode[i] == 4:
if debug_able:
logger.info(f"列表行的第{i}行不是顶格的")
break
else:
if debug_able: if debug_able:
logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}") logger.info(f"列表行的第{start}到第{end}行是列表")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments = []
for start, end in list_indice:
for i in range(start, end + 1):
if i > 0:
if line_fea_encode[i] == 4:
if debug_able:
logger.info(f"列表行的第{i}行不是顶格的")
break
else:
if debug_able:
logger.info(f"列表行的第{start}到第{end}行是列表")
return split_indices(total_lines, list_indice), list_start_idx return split_indices(total_lines, list_indice), list_start_idx
def cluster_line_x(lines: list) -> dict: def cluster_line_x(lines: list) -> dict:
......
...@@ -31,6 +31,22 @@ ...@@ -31,6 +31,22 @@
"created_at": "2024-08-13T12:23:16Z", "created_at": "2024-08-13T12:23:16Z",
"repoId": 765083837, "repoId": 765083837,
"pullRequestNo": 418 "pullRequestNo": 418
},
{
"name": "Matthijz98",
"id": 17087153,
"comment_id": 2298912989,
"created_at": "2024-08-20T13:49:50Z",
"repoId": 765083837,
"pullRequestNo": 467
},
{
"name": "strongerfly",
"id": 11643869,
"comment_id": 2309481561,
"created_at": "2024-08-26T07:01:49Z",
"repoId": 765083837,
"pullRequestNo": 487
} }
] ]
} }
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment