merge: sync from master branch

6f58eeab · drunkpig · 9067cd31 · 7f0fe200 · 6f58eeab · 6f58eeab
Commit 6f58eeab authored Aug 28, 2024 by drunkpig
7 changed files
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,4 +48,4 @@ RUN /bin/bash -c "pip3 install modelscope && \
    sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
 # Set the entry point to activate the virtual environment and run the command line tool
 ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
\ No newline at end of file
--- a/README_zh-CN.md.bak
+++ b/README_zh-CN.md.bak
--- a/docs/download_models.py
+++ b/docs/download_models.py
+# use modelscope sdk download models
+from modelscope import snapshot_download
+model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
+print(f"model dir is: {model_dir}/models")
--- a/docs/how_to_download_models_en.md
+++ b/docs/how_to_download_models_en.md
@@ -9,7 +9,7 @@ git lfs install
 To download the `PDF-Extract-Kit` model from Hugging Face, use the following command:
 ```bash
-git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit
+git lfs clone https://huggingface.co/opendatalab/PDF-Extract-Kit
 ```
 Ensure that Git LFS is enabled during the clone to properly download all large files.

--- a/docs/how_to_download_models_zh_cn.md
+++ b/docs/how_to_download_models_zh_cn.md
@@ -13,7 +13,7 @@
 ```bash
 git lfs install # 安装 Git 大文件存储插件 (Git LFS) 
-git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit # 从 Hugging Face 下载 PDF-Extract-Kit 模型
+git lfs clone https://huggingface.co/opendatalab/PDF-Extract-Kit # 从 Hugging Face 下载 PDF-Extract-Kit 模型
 ```
@@ -28,7 +28,7 @@ ModelScope 支持SDK或模型下载，任选一个即可。
 ```bash
 git lfs install
-git lfs clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
+git lfs clone https://www.modelscope.cn/opendatalab/PDF-Extract-Kit.git
 ```
 ### 2）利用SDK下载
@@ -41,7 +41,7 @@ pip install modelscope
 ```python
 # 使用modelscope sdk下载模型
 from modelscope import snapshot_download
-model_dir = snapshot_download('wanderkid/PDF-Extract-Kit')
+model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
 print(f"模型文件下载路径为：{model_dir}/models")
 ```

--- a/magic_pdf/para/para_split_v2.py
+++ b/magic_pdf/para/para_split_v2.py
@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
    if lang != 'en':
        return lines, None
-    else:
-        total_lines = len(lines)
+    total_lines = len(lines)
-        line_fea_encode = []
+    line_fea_encode = []
-        """
+    """
-        对每一行进行特征编码，编码规则如下：
+    对每一行进行特征编码，编码规则如下：
-        1. 如果行顶格，且大写字母开头或者数字开头，编码为1
+    1. 如果行顶格，且大写字母开头或者数字开头，编码为1
-        2. 如果顶格，其他非大写开头编码为4
+    2. 如果顶格，其他非大写开头编码为4
-        3. 如果非顶格，首字符大写，编码为2
+    3. 如果非顶格，首字符大写，编码为2
-        4. 如果非顶格，首字符非大写编码为3
+    4. 如果非顶格，首字符非大写编码为3
-        """
+    """
-        if len(lines) > 0:
+    if len(lines) > 0:
-            x_map_tag_dict, min_x_tag = cluster_line_x(lines)
+        x_map_tag_dict, min_x_tag = cluster_line_x(lines)
-        for l in lines:
+    for l in lines:
-            span_text = __get_span_text(l['spans'][0])
+        span_text = __get_span_text(l['spans'][0])
-            first_char = span_text[0]
+        if not span_text:
-            layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
+            line_fea_encode.append(0)
-            if not layout:
+            continue
-                line_fea_encode.append(0)
+        first_char = span_text[0]
+        layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
+        if not layout:
+            line_fea_encode.append(0)
+        else:
+            #
+            if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
+                # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
+                if not first_char.isalnum() or if_match_reference_list(span_text):
+                    line_fea_encode.append(1)
+                else:
+                    line_fea_encode.append(4)
            else:
-                #
+                if first_char.isupper():
-                if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
+                    line_fea_encode.append(2)
-                    # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
-                    if not first_char.isalnum() or if_match_reference_list(span_text):
-                        line_fea_encode.append(1)
-                    else:
-                        line_fea_encode.append(4)
                else:
-                    if first_char.isupper():
+                    line_fea_encode.append(3)
-                        line_fea_encode.append(2)
-                    else:
-                        line_fea_encode.append(3)
-        # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行，认为是列表。
+    # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行，认为是列表。
-        list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
+    list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
-        if len(list_indice) > 0:
+    if len(list_indice) > 0:
+        if debug_able:
+            logger.info(f"发现了列表，列表行数：{list_indice}， {list_start_idx}")
+    # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
+    segments = []
+    for start, end in list_indice:
+        for i in range(start, end + 1):
+            if i > 0:
+                if line_fea_encode[i] == 4:
+                    if debug_able:
+                        logger.info(f"列表行的第{i}行不是顶格的")
+                    break
+        else:
            if debug_able:
-                logger.info(f"发现了列表，列表行数：{list_indice}， {list_start_idx}")
+                logger.info(f"列表行的第{start}到第{end}行是列表")
-        # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
-        segments = []
-        for start, end in list_indice:
-            for i in range(start, end + 1):
-                if i > 0:
-                    if line_fea_encode[i] == 4:
-                        if debug_able:
-                            logger.info(f"列表行的第{i}行不是顶格的")
-                        break
-            else:
-                if debug_able:
-                    logger.info(f"列表行的第{start}到第{end}行是列表")
-        return split_indices(total_lines, list_indice), list_start_idx
+    return split_indices(total_lines, list_indice), list_start_idx
 def cluster_line_x(lines: list) -> dict:

--- a/signatures/version1/cla.json
+++ b/signatures/version1/cla.json
@@ -31,6 +31,22 @@
      "created_at": "2024-08-13T12:23:16Z",
      "repoId": 765083837,
      "pullRequestNo": 418
+    },
+    {
+      "name": "Matthijz98",
+      "id": 17087153,
+      "comment_id": 2298912989,
+      "created_at": "2024-08-20T13:49:50Z",
+      "repoId": 765083837,
+      "pullRequestNo": 467
+    },
+    {
+      "name": "strongerfly",
+      "id": 11643869,
+      "comment_id": 2309481561,
+      "created_at": "2024-08-26T07:01:49Z",
+      "repoId": 765083837,
+      "pullRequestNo": 487
    }
  ]
 }
\ No newline at end of file