Merge branch 'master' into master

c8b06ad5 · myhloli · GitHub · 88f5b932 · 2783bb39 · c8b06ad5
Unverified Commit c8b06ad5 authored Apr 10, 2024 by myhloli Committed by GitHub Apr 10, 2024
9 changed files
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -9,7 +9,7 @@ on:
    paths-ignore:
      - "cmds/**"
      - "**.md"
+  workflow_dispatch:
 jobs:
  pdf-test:
    runs-on: pdf
@@ -18,14 +18,16 @@ jobs:
      fail-fast: true
    steps:
+    - name: config-net
+      env:
+        # The proxy URL embeds credentials, so it is read from a repository
+        # secret (PROXY_URL must be configured) instead of being committed in
+        # plain text. NOTE(review): the credentials that were previously
+        # committed should be rotated.
+        PROXY_URL: ${{ secrets.PROXY_URL }}
+      run: |
+        # Variables exported in a step's shell are gone once the step ends;
+        # appending them to GITHUB_ENV makes them visible to all later steps.
+        echo "http_proxy=${PROXY_URL}" >> "$GITHUB_ENV"
+        echo "https_proxy=${PROXY_URL}" >> "$GITHUB_ENV"
    - name: PDF benchmark
      uses: actions/checkout@v3
      with:
        fetch-depth: 2
    - name: check-requirements
      run: |
-        export http_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
-        export https_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
        changed_files=$(git diff --name-only -r HEAD~1 HEAD)
        echo $changed_files
        if [[ $changed_files =~ "requirements.txt" ]]; then
@@ -36,4 +38,12 @@ jobs:
    - name: benchmark
      run: |
        echo "start test"
-        cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip output.json
+        cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip badcase.json overall.json base_data.json
+  notify_to_feishu:
+    # Runs only when a needed job reported 'failure', the run was not
+    # cancelled, and the ref name is 'master'.
+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
+    needs: [pdf-test]
+    runs-on: [pdf]
+    steps:
+    # POSTs a JSON "post" message to the webhook in secrets.WEBHOOK_URL; the
+    # message links to this Actions run and mentions the user in secrets.USER_ID.
+    - name: notify
+      run: |
+        curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}'  ${{ secrets.WEBHOOK_URL }}
--- a/.github/workflows/update_base.yml
+++ b/.github/workflows/update_base.yml
 # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
-name: PDF
+name: update-base
 on:
-release:
+  push:
-  types: [published]
+    tags:
+      - '*released'
+  workflow_dispatch:
 jobs:
  pdf-test:
    runs-on: pdf
@@ -15,6 +16,7 @@ jobs:
    steps:
    - name: update-base
      uses: actions/checkout@v3
+    - name: start-update
      run: |
-          python update_base.py
+        echo "start test"
--- a/demo/ocr_demo.py
+++ b/demo/ocr_demo.py
@@ -116,6 +116,7 @@ if __name__ == '__main__':
    pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
    json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
    # ocr_local_parse(pdf_path, json_file_path)
    book_name = "数学新星网/edu_00001236"
    ocr_online_parse(book_name)

--- a/magic_pdf/io/AbsReaderWriter.py
+++ b/magic_pdf/io/AbsReaderWriter.py
 from abc import ABC, abstractmethod
 class AbsReaderWriter(ABC):
    """
    Abstract base class that supports both binary and text reads and writes.
-    TODO
    """
-    @abstractmethod
+    MODE_TXT = "text"
-    def read(self, path: str):
+    MODE_BIN = "binary"
-        pass
-    @abstractmethod
-    def write(self, path: str, content: str):
-        pass
+    def __init__(self, parent_path):
+        # Additional initialization can be added here if needed.
+        self.parent_path = parent_path # Parent directory for local storage; for S3, the path that content is written under.
+    @abstractmethod
+    def read(self, path: str, mode="text"):
+        """
+        For both local and S3 paths: if `path` is absolute, parent_path is not prepended; if it is relative, parent_path is prepended.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def write(self, content: str, path: str, mode=MODE_TXT):
+        """
+        For both local and S3 paths: if `path` is absolute, parent_path is not prepended; if it is relative, parent_path is prepended.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
+        """
+        For both local and S3 paths: if `path` is absolute, parent_path is not prepended; if it is relative, parent_path is prepended.
+        """
+        raise NotImplementedError
--- a/magic_pdf/io/DiskReaderWriter.py
+++ b/magic_pdf/io/DiskReaderWriter.py
+import os
+from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
+from loguru import logger
+class DiskReaderWriter(AbsReaderWriter):
+    """Read and write one local file in text or binary mode.
+
+    NOTE(review): ``read`` and ``write`` operate on the path passed to the
+    constructor and take no ``path`` argument, unlike the signatures declared
+    on ``AbsReaderWriter``; the existing call shape is kept for callers.
+    """
+
+    def __init__(self, parent_path, encoding='utf-8'):
+        """Store the target file path and the encoding used in text mode."""
+        super().__init__(parent_path)
+        self.path = parent_path
+        self.encoding = encoding
+
+    def read(self, mode="text"):
+        """Return the file content as ``str`` (text) or ``bytes`` (binary).
+
+        Raises:
+            FileNotFoundError: the file does not exist.
+            ValueError: ``mode`` is neither 'text' nor 'binary'.
+        """
+        if not os.path.exists(self.path):
+            logger.error(f"文件 {self.path} 不存在")
+            # FileNotFoundError is still caught by existing `except Exception`.
+            raise FileNotFoundError(f"文件 {self.path} 不存在")
+        if mode == self.MODE_TXT:
+            with open(self.path, 'r', encoding=self.encoding) as f:
+                return f.read()
+        if mode == self.MODE_BIN:
+            with open(self.path, 'rb') as f:
+                return f.read()
+        raise ValueError("Invalid mode. Use 'text' or 'binary'.")
+
+    def write(self, data, mode="text"):
+        """Overwrite the file with ``data`` (``str`` for text, bytes for binary).
+
+        Raises:
+            ValueError: ``mode`` is neither 'text' nor 'binary'.
+        """
+        if mode == self.MODE_TXT:
+            with open(self.path, 'w', encoding=self.encoding) as f:
+                f.write(data)
+        elif mode == self.MODE_BIN:
+            with open(self.path, 'wb') as f:
+                f.write(data)
+        else:
+            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
+        logger.info(f"内容已成功写入 {self.path}")
+
+    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
+        """Not supported for local files yet.
+
+        Overridden so the class satisfies the abstract interface; without an
+        override the class cannot be instantiated at all.
+        """
+        raise NotImplementedError("DiskReaderWriter.read_jsonl is not implemented")
+# Usage example
+if __name__ == "__main__":
+    file_path = "example.txt"
+    drw = DiskReaderWriter(file_path)
+    # Write content to the file (binary mode)
+    drw.write(b"Hello, World!", mode="binary")
+    # Read the content back from the file (default text mode)
+    content = drw.read()
+    if content:
+        logger.info(f"从 {file_path} 读取的内容: {content}")
--- a/magic_pdf/io/S3ReaderWriter.py
+++ b/magic_pdf/io/S3ReaderWriter.py
-from magic_pdf.io import AbsReaderWriter
+from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
+from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key
+import boto3
+from loguru import logger
+from boto3.s3.transfer import TransferConfig
+from botocore.config import Config
-class DiskReaderWriter(AbsReaderWriter):
+class S3ReaderWriter(AbsReaderWriter):
+    """Read and write S3 objects in text or binary mode through a boto3 client.
+
+    NOTE(review): ``AbsReaderWriter.__init__`` is not called, so instances
+    have no ``parent_path`` attribute.
+    """
-    def __init__(self, parent_path, encoding='utf-8'):
+    def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
+        """Create the S3 client used by all reads and writes."""
-        self.path = parent_path
+        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
-        self.encoding = encoding
-    def read(self):
+    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
+        """Build a boto3 S3 client with the given credentials, endpoint and addressing style."""
-        with open(self.path, 'rb') as f:
+        s3_client = boto3.client(
-            return f.read()
+            service_name="s3",
+            aws_access_key_id=ak,
+            aws_secret_access_key=sk,
+            endpoint_url=endpoint_url,
+            config=Config(s3={"addressing_style": addressing_style},
+                          retries={'max_attempts': 5, 'mode': 'standard'}),
+        )
+        return s3_client
+    def read(self, s3_path, mode="text", encoding="utf-8"):
+        """Fetch the object at ``s3_path``; return ``str`` in text mode, bytes in binary mode.
+
+        Raises ValueError when ``mode`` is neither 'text' nor 'binary'.
+        """
+        bucket_name, bucket_key = parse_bucket_key(s3_path)
+        res = self.client.get_object(Bucket=bucket_name, Key=bucket_key)
+        body = res["Body"].read()
+        if mode == 'text':
+            data = body.decode(encoding)  # Decode bytes to text
+        elif mode == 'binary':
+            data = body
+        else:
+            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
+        return data
-    def write(self, data):
+    def write(self, data, s3_path, mode="text", encoding="utf-8"):
+        """Upload ``data`` to ``s3_path``; text is encoded with ``encoding`` first.
+
+        Raises ValueError when ``mode`` is neither 'text' nor 'binary'.
+        """
-        with open(self.path, 'wb') as f:
+        if mode == 'text':
-            f.write(data)
+            body = data.encode(encoding)  # Encode text data as bytes
+        elif mode == 'binary':
+            body = data
+        else:
+            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
+        bucket_name, bucket_key = parse_bucket_key(s3_path)
+        self.client.put_object(Body=body, Bucket=bucket_name, Key=bucket_key)
+        logger.info(f"内容已写入 {s3_path} ")
+    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
+        """Not supported for S3 yet.
+
+        Overridden so the class satisfies the abstract interface; without an
+        override the class cannot be instantiated at all.
+        """
+        raise NotImplementedError("S3ReaderWriter.read_jsonl is not implemented")
+if __name__ == "__main__":
+    # Config the connection info
+    ak = ""
+    sk = ""
+    endpoint_url = ""
+    addressing_style = ""
+    # Create an S3ReaderWriter object
+    s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style)
+    # Write text data to S3
+    text_data = "This is some text data"
+    s3_reader_writer.write(data=text_data, s3_path = "s3://bucket_name/ebook/test/test.json", mode='text')
+    # Read text data from S3
+    text_data_read = s3_reader_writer.read(s3_path = "s3://bucket_name/ebook/test/test.json", mode='text')
+    logger.info(f"Read text data from S3: {text_data_read}")
+    # Write binary data to S3 (binary mode uploads the payload as-is, so it must be bytes)
+    binary_data = b"This is some binary data"
+    s3_reader_writer.write(data=binary_data, s3_path = "s3://bucket_name/ebook/test/test2.json", mode='binary')
+    # Read binary data from S3
+    binary_data_read = s3_reader_writer.read(s3_path = "s3://bucket_name/ebook/test/test2.json", mode='binary')
+    logger.info(f"Read binary data from S3: {binary_data_read}")
\ No newline at end of file
--- a/magic_pdf/para/para_split.py
+++ b/magic_pdf/para/para_split.py
@@ -183,11 +183,31 @@ def __valign_lines(blocks, layout_bboxes):
    return new_layout_bboxes
+def __align_text_in_layout(blocks, layout_bboxes):
+    """
+    OCR lines sometimes carry blank padding before or after the text, so the
+    lines are aligned to their layout: for every block inside a layout box,
+    any part of a line that sticks out is clipped at the layout's left and
+    right edges. Line bboxes are modified in place; nothing is returned.
+    """
+    for layout_item in layout_bboxes:
+        layout_box = layout_item['layout_bbox']
+        # Select the blocks first, then clip; an empty selection is a no-op.
+        contained = [blk for blk in blocks if is_in_layout(blk['bbox'], layout_box)]
+        for blk in contained:
+            for text_line in blk['lines']:
+                left, right = text_line['bbox'][0], text_line['bbox'][2]
+                if left < layout_box[0]:
+                    text_line['bbox'][0] = layout_box[0]
+                if right > layout_box[2]:
+                    text_line['bbox'][2] = layout_box[2]
 def __common_pre_proc(blocks, layout_bboxes):
    """
    Language-agnostic preprocessing of the text: clip lines to their layout
    boxes, then run __valign_lines and return the layout boxes it produces.
    """
    #__add_line_period(blocks, layout_bboxes)
+    __align_text_in_layout(blocks, layout_bboxes)
    aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes)
    return aligned_layout_bboxes
@@ -233,7 +253,6 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
    layout_paras = []
    right_tail_distance = 1.5 * char_avg_len
    for lines in lines_group:
        paras = []
        total_lines = len(lines)

--- a/tools/base_data.json
+++ b/tools/base_data.json
+{
+    "accuracy": 1.0,
+    "precision": 1.0,
+    "recall": 1.0,
+    "f1_score": 1.0,
+    "pdf间的平均编辑距离": 133.10256410256412,
+    "pdf间的平均bleu": 0.28838311595434046,
+    "分段准确率": 0.07220216606498195,
+    "行内公式准确率": {
+        "accuracy": 0.004835727492533068,
+        "precision": 0.008790072388831437,
+        "recall": 0.010634970284641852,
+        "f1_score": 0.009624911535739562
+    },
+    "行内公式编辑距离": 1.6176470588235294,
+    "行内公式bleu": 0.17154724654721457,
+    "行间公式准确率": {
+        "accuracy": 0.08490566037735849,
+        "precision": 0.1836734693877551,
+        "recall": 0.13636363636363635,
+        "f1_score": 0.1565217391304348
+    },
+    "行间公式编辑距离": 113.22222222222223,
+    "行间公式bleu": 0.2531053359913409,
+    "丢弃文本准确率": {
+        "accuracy": 0.00035398230088495576,
+        "precision": 0.0006389776357827476,
+        "recall": 0.0007930214115781126,
+        "f1_score": 0.0007077140835102619
+    },
+    "丢弃文本标签准确率": {
+        "color_background_header_txt_block": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 41.0
+        },
+        "header": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 4.0
+        },
+        "footnote": {
+            "precision": 1.0,
+            "recall": 0.009708737864077669,
+            "f1-score": 0.019230769230769232,
+            "support": 103.0
+        },
+        "on-table": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 665.0
+        },
+        "rotate": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 63.0
+        },
+        "on-image": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 380.0
+        },
+        "micro avg": {
+            "precision": 1.0,
+            "recall": 0.0007961783439490446,
+            "f1-score": 0.0015910898965791568,
+            "support": 1256.0
+        }
+    },
+    "丢弃图片准确率": {
+        "accuracy": 0.0,
+        "precision": 0.0,
+        "recall": 0.0,
+        "f1_score": 0.0
+    },
+    "丢弃表格准确率": {
+        "accuracy": 0.0,
+        "precision": 0.0,
+        "recall": 0.0,
+        "f1_score": 0.0
+    }
+}
\ No newline at end of file
--- a/tools/ocr_badcase.py
+++ b/tools/ocr_badcase.py