Unverified Commit c8b06ad5 authored by myhloli's avatar myhloli Committed by GitHub

Merge branch 'master' into master

parents 88f5b932 2783bb39
...@@ -9,7 +9,7 @@ on: ...@@ -9,7 +9,7 @@ on:
paths-ignore: paths-ignore:
- "cmds/**" - "cmds/**"
- "**.md" - "**.md"
workflow_dispatch:
jobs: jobs:
pdf-test: pdf-test:
runs-on: pdf runs-on: pdf
...@@ -18,14 +18,16 @@ jobs: ...@@ -18,14 +18,16 @@ jobs:
fail-fast: true fail-fast: true
steps: steps:
- name: config-net
  run: |
    # `export` only affects this step's own shell, so the proxy settings are
    # written to GITHUB_ENV to make them visible to the steps that follow.
    # The proxy URL embeds a username and password; it is read from the
    # repository secret PROXY_URL (must be configured) instead of being
    # committed in plain text.
    echo "http_proxy=${{ secrets.PROXY_URL }}" >> "$GITHUB_ENV"
    echo "https_proxy=${{ secrets.PROXY_URL }}" >> "$GITHUB_ENV"
- name: PDF benchmark - name: PDF benchmark
uses: actions/checkout@v3 uses: actions/checkout@v3
with: with:
fetch-depth: 2 fetch-depth: 2
- name: check-requirements - name: check-requirements
run: | run: |
export http_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
export https_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
changed_files=$(git diff --name-only -r HEAD~1 HEAD) changed_files=$(git diff --name-only -r HEAD~1 HEAD)
echo $changed_files echo $changed_files
if [[ $changed_files =~ "requirements.txt" ]]; then if [[ $changed_files =~ "requirements.txt" ]]; then
...@@ -36,4 +38,12 @@ jobs: ...@@ -36,4 +38,12 @@ jobs:
- name: benchmark - name: benchmark
run: | run: |
echo "start test" echo "start test"
cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip output.json cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip badcase.json overall.json base_data.json
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
needs: [pdf-test]
runs-on: [pdf]
steps:
- name: notify
run: |
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: PDF name: update-base
on: on:
release: push:
types: [published] tags:
- '*released'
workflow_dispatch:
jobs: jobs:
pdf-test: pdf-test:
runs-on: pdf runs-on: pdf
...@@ -15,6 +16,7 @@ jobs: ...@@ -15,6 +16,7 @@ jobs:
steps: steps:
- name: update-base - name: update-base
uses: actions/checkout@v3 uses: actions/checkout@v3
- name: start-update
run: | run: |
python update_base.py echo "start test"
...@@ -116,6 +116,7 @@ if __name__ == '__main__': ...@@ -116,6 +116,7 @@ if __name__ == '__main__':
pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf" pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json" json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path) # ocr_local_parse(pdf_path, json_file_path)
book_name = "数学新星网/edu_00001236" book_name = "数学新星网/edu_00001236"
ocr_online_parse(book_name) ocr_online_parse(book_name)
......
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
class AbsReaderWriter(ABC): class AbsReaderWriter(ABC):
""" """
同时支持二进制和文本读写的抽象类 同时支持二进制和文本读写的抽象类
TODO
""" """
@abstractmethod MODE_TXT = "text"
def read(self, path: str): MODE_BIN = "binary"
pass
@abstractmethod
def write(self, path: str, content: str):
pass
def __init__(self, parent_path):
    """
    Remember the base location that relative paths are resolved against.

    :param parent_path: for a local directory this is the parent directory;
        for S3 it is the path that content is written under.
    """
    # Additional initialisation code can be added here if needed.
    self.parent_path = parent_path  # Parent directory locally; for S3, content is written under this path.
@abstractmethod
def read(self, path: str, mode="text"):
    """
    Read the content at ``path``.

    For both local and S3 paths: if ``path`` is absolute, parent_path is not
    prepended; if it is relative, parent_path is prepended.
    """
    raise NotImplementedError
@abstractmethod
def write(self, content: str, path: str, mode=MODE_TXT):
    """
    Write ``content`` to ``path``.

    For both local and S3 paths: if ``path`` is absolute, parent_path is not
    prepended; if it is relative, parent_path is prepended.
    """
    raise NotImplementedError
@abstractmethod
def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
    """
    Read JSONL content at ``path``.

    For both local and S3 paths: if ``path`` is absolute, parent_path is not
    prepended; if it is relative, parent_path is prepended.

    NOTE(review): byte_start / byte_end / encoding are not described by the
    original docstring -- presumably a byte range to read and the encoding
    used to decode it; confirm against the concrete implementations.
    """
    raise NotImplementedError
import os
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from loguru import logger
class DiskReaderWriter(AbsReaderWriter):
    """
    Reader/writer bound to a single local file.

    The file can be accessed as text or as raw bytes, selected with the
    ``mode`` argument ("text" or "binary") of read() and write().

    NOTE(review): read()/write() take no ``path`` argument and read_jsonl()
    is not implemented here, unlike the abstract signatures on
    AbsReaderWriter -- confirm the intended interface.
    """

    def __init__(self, parent_path, encoding='utf-8'):
        """
        :param parent_path: path of the file that read() and write() operate on.
        :param encoding: encoding used when the file is opened in "text" mode.
        """
        self.path = parent_path
        # Also expose the location under the attribute name used by the
        # abstract base class, so code written against it finds the value.
        self.parent_path = parent_path
        self.encoding = encoding

    def read(self, mode="text"):
        """
        Return the whole content of the file.

        :param mode: "text" returns a str decoded with ``self.encoding``;
            "binary" returns bytes.
        :raises FileNotFoundError: if the file does not exist.
        :raises ValueError: if ``mode`` is neither "text" nor "binary".
        """
        if not os.path.exists(self.path):
            logger.error(f"文件 {self.path} 不存在")
            # FileNotFoundError is still an Exception, so callers that catch
            # the broad type keep working while others can be specific.
            raise FileNotFoundError(f"文件 {self.path} 不存在")
        if mode == "text":
            with open(self.path, 'r', encoding=self.encoding) as f:
                return f.read()
        if mode == "binary":
            with open(self.path, 'rb') as f:
                return f.read()
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, data, mode="text"):
        """
        Overwrite the file with ``data``.

        :param data: str for "text" mode, bytes for "binary" mode.
        :param mode: "text" or "binary".
        :raises ValueError: if ``mode`` is neither "text" nor "binary"; the
            file is not touched in that case.
        """
        if mode == "text":
            open_kwargs = {"mode": "w", "encoding": self.encoding}
        elif mode == "binary":
            open_kwargs = {"mode": "wb"}
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        with open(self.path, **open_kwargs) as f:
            f.write(data)
        # Report success only once the file has been closed.
        logger.info(f"内容已成功写入 {self.path}")
# Usage example
if __name__ == "__main__":
    file_path = "example.txt"
    drw = DiskReaderWriter(file_path)
    # Write content to the file (raw bytes, binary mode)
    drw.write(b"Hello, World!", mode="binary")
    # Read the content back from the file (default mode is "text")
    content = drw.read()
    if content:
        logger.info(f"从 {file_path} 读取的内容: {content}")
from magic_pdf.io import AbsReaderWriter from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key
import boto3
from loguru import logger
from boto3.s3.transfer import TransferConfig
from botocore.config import Config
class DiskReaderWriter(AbsReaderWriter): class S3ReaderWriter(AbsReaderWriter):
def __init__(self, parent_path, encoding='utf-8'): def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
self.path = parent_path self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
self.encoding = encoding
def read(self): def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
with open(self.path, 'rb') as f: s3_client = boto3.client(
return f.read() service_name="s3",
aws_access_key_id=ak,
aws_secret_access_key=sk,
endpoint_url=endpoint_url,
config=Config(s3={"addressing_style": addressing_style},
retries={'max_attempts': 5, 'mode': 'standard'}),
)
return s3_client
def read(self, s3_path, mode="text", encoding="utf-8"):
    """
    Fetch an object from S3 and return its content.

    :param s3_path: location of the object; split into bucket and key by
        parse_bucket_key.
    :param mode: 'text' returns the body decoded to str; 'binary' returns
        the raw bytes.
    :param encoding: encoding used to decode the body in 'text' mode.
    :return: str in 'text' mode, bytes in 'binary' mode.
    :raises ValueError: if ``mode`` is neither 'text' nor 'binary'; raised
        before any request is sent.
    """
    # Reject a bad mode up front (as write() does) instead of downloading
    # the whole object first and only then failing.
    if mode not in ('text', 'binary'):
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")
    bucket_name, bucket_key = parse_bucket_key(s3_path)
    res = self.client.get_object(Bucket=bucket_name, Key=bucket_key)
    body = res["Body"].read()
    if mode == 'text':
        return body.decode(encoding)  # Decode bytes to text
    return body
def write(self, data): def write(self, data, s3_path, mode="text", encoding="utf-8"):
with open(self.path, 'wb') as f: if mode == 'text':
f.write(data) body = data.encode(encoding) # Encode text data as bytes
elif mode == 'binary':
body = data
else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
bucket_name, bucket_key = parse_bucket_key(s3_path)
self.client.put_object(Body=body, Bucket=bucket_name, Key=bucket_key)
logger.info(f"内容已写入 {s3_path} ")
if __name__ == "__main__":
    # Connection settings -- fill these in before running the demo.
    ak = ""
    sk = ""
    endpoint_url = ""
    addressing_style = ""

    # Create an S3ReaderWriter object
    s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style)

    # Write text data to S3
    text_data = "This is some text data"
    s3_reader_writer.write(data=text_data, s3_path="s3://bucket_name/ebook/test/test.json", mode='text')

    # Read text data from S3
    text_data_read = s3_reader_writer.read(s3_path="s3://bucket_name/ebook/test/test.json", mode='text')
    logger.info(f"Read text data from S3: {text_data_read}")

    # Write binary data to S3.
    # The bytes payload is passed here; the previous version passed the str
    # ``text_data`` in binary mode and left ``binary_data`` unused.
    binary_data = b"This is some binary data"
    s3_reader_writer.write(data=binary_data, s3_path="s3://bucket_name/ebook/test/test2.json", mode='binary')

    # Read binary data from S3
    binary_data_read = s3_reader_writer.read(s3_path="s3://bucket_name/ebook/test/test2.json", mode='binary')
    logger.info(f"Read binary data from S3: {binary_data_read}")
\ No newline at end of file
...@@ -183,11 +183,31 @@ def __valign_lines(blocks, layout_bboxes): ...@@ -183,11 +183,31 @@ def __valign_lines(blocks, layout_bboxes):
return new_layout_bboxes return new_layout_bboxes
def __align_text_in_layout(blocks, layout_bboxes):
    """
    OCR lines sometimes carry blank space before or after the text, so their
    horizontal extent is aligned to the layout: any part of a line that sticks
    out past the left or right side of its layout box is cut off at that side.
    Line bboxes are modified in place.
    """
    for layout in layout_bboxes:
        layout_box = layout['layout_bbox']
        # Blocks that is_in_layout places inside this layout box; an empty
        # selection simply makes the loops below do nothing.
        contained = [blk for blk in blocks if is_in_layout(blk['bbox'], layout_box)]
        for blk in contained:
            for line in blk['lines']:
                line_box = line['bbox']
                left, right = line_box[0], line_box[2]
                if left < layout_box[0]:
                    # Line starts left of the layout: pull its left edge in.
                    line_box[0] = layout_box[0]
                if right > layout_box[2]:
                    # Line ends right of the layout: pull its right edge in.
                    line_box[2] = layout_box[2]
def __common_pre_proc(blocks, layout_bboxes): def __common_pre_proc(blocks, layout_bboxes):
""" """
不分语言的,对文本进行预处理 不分语言的,对文本进行预处理
""" """
#__add_line_period(blocks, layout_bboxes) #__add_line_period(blocks, layout_bboxes)
__align_text_in_layout(blocks, layout_bboxes)
aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes) aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes)
return aligned_layout_bboxes return aligned_layout_bboxes
...@@ -233,7 +253,6 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_ ...@@ -233,7 +253,6 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_paras = [] layout_paras = []
right_tail_distance = 1.5 * char_avg_len right_tail_distance = 1.5 * char_avg_len
for lines in lines_group: for lines in lines_group:
paras = [] paras = []
total_lines = len(lines) total_lines = len(lines)
......
{
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1_score": 1.0,
"pdf间的平均编辑距离": 133.10256410256412,
"pdf间的平均bleu": 0.28838311595434046,
"分段准确率": 0.07220216606498195,
"行内公式准确率": {
"accuracy": 0.004835727492533068,
"precision": 0.008790072388831437,
"recall": 0.010634970284641852,
"f1_score": 0.009624911535739562
},
"行内公式编辑距离": 1.6176470588235294,
"行内公式bleu": 0.17154724654721457,
"行间公式准确率": {
"accuracy": 0.08490566037735849,
"precision": 0.1836734693877551,
"recall": 0.13636363636363635,
"f1_score": 0.1565217391304348
},
"行间公式编辑距离": 113.22222222222223,
"行间公式bleu": 0.2531053359913409,
"丢弃文本准确率": {
"accuracy": 0.00035398230088495576,
"precision": 0.0006389776357827476,
"recall": 0.0007930214115781126,
"f1_score": 0.0007077140835102619
},
"丢弃文本标签准确率": {
"color_background_header_txt_block": {
"precision": 0.0,
"recall": 0.0,
"f1-score": 0.0,
"support": 41.0
},
"header": {
"precision": 0.0,
"recall": 0.0,
"f1-score": 0.0,
"support": 4.0
},
"footnote": {
"precision": 1.0,
"recall": 0.009708737864077669,
"f1-score": 0.019230769230769232,
"support": 103.0
},
"on-table": {
"precision": 0.0,
"recall": 0.0,
"f1-score": 0.0,
"support": 665.0
},
"rotate": {
"precision": 0.0,
"recall": 0.0,
"f1-score": 0.0,
"support": 63.0
},
"on-image": {
"precision": 0.0,
"recall": 0.0,
"f1-score": 0.0,
"support": 380.0
},
"micro avg": {
"precision": 1.0,
"recall": 0.0007961783439490446,
"f1-score": 0.0015910898965791568,
"support": 1256.0
}
},
"丢弃图片准确率": {
"accuracy": 0.0,
"precision": 0.0,
"recall": 0.0,
"f1_score": 0.0
},
"丢弃表格准确率": {
"accuracy": 0.0,
"precision": 0.0,
"recall": 0.0,
"f1_score": 0.0
}
}
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment