Commit 90216330 authored by Shuimo's avatar Shuimo

update the text_badcase script and add auto upload s3 function

parents ad8941e3 ff8f62aa
......@@ -5,7 +5,13 @@ name: PDF
on:
push:
branches:
- master
- "master"
paths-ignore:
- "cmds/**"
- "**.md"
pull_request:
branches:
- "master"
paths-ignore:
- "cmds/**"
- "**.md"
......@@ -18,14 +24,16 @@ jobs:
fail-fast: true
steps:
- name: config-net
run: |
export http_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
export https_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
- name: PDF benchmark
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: check-requirements
run: |
export http_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
export https_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
changed_files=$(git diff --name-only -r HEAD~1 HEAD)
echo $changed_files
if [[ $changed_files =~ "requirements.txt" ]]; then
......@@ -36,12 +44,17 @@ jobs:
- name: benchmark
run: |
echo "start test"
cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip output.json
cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip badcase.json overall.json base_data.json
notify_to_feishu:
if: ${{ (github.ref_name == 'master') }}
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
needs: [pdf-test]
runs-on: [pdf]
steps:
- name: notify
run: |
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
curl ${{ secrets.WEBHOOK_URL }} -H 'Content-Type: application/json' -d '{
"msgtype": "text",
"text": {
"content": "'${{ github.repository }}' GitHubAction Failed!\n 细节请查看:https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"
}
}'
......@@ -115,8 +115,9 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
if __name__ == '__main__':
pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
ocr_local_parse(pdf_path, json_file_path)
# book_name = "科数网/edu_00011318"
# ocr_online_parse(book_name)
# ocr_local_parse(pdf_path, json_file_path)
book_name = "数学新星网/edu_00001236"
ocr_online_parse(book_name)
pass
from magic_pdf.libs.commons import s3_image_save_path, join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType
......@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
if not span.get('image_path'):
continue
else:
content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
content = f"![]({span['image_path']})"
else:
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
if span['type'] == ContentType.InlineEquation:
......@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]:
if mode == 'mm':
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
content = f"\n![]({span['image_path']})\n"
elif mode == 'nlp':
pass
if content != '':
......@@ -195,13 +194,13 @@ def line_to_standard_format(line):
if span['type'] == ContentType.Image:
content = {
'type': 'image',
'img_path': join_path(s3_image_save_path, span['image_path'])
'img_path': span['image_path']
}
return content
elif span['type'] == ContentType.Table:
content = {
'type': 'table',
'img_path': join_path(s3_image_save_path, span['image_path'])
'img_path': span['image_path']
}
return content
else:
......
......@@ -10,10 +10,10 @@ class AbsReaderWriter(ABC):
def __init__(self, parent_path):
# 初始化代码可以在这里添加,如果需要的话
self.parent_path = parent_path # 对于本地目录是父目录,对于s3是会写到这个apth下。
self.parent_path = parent_path # 对于本地目录是父目录,对于s3是会写到这个path下。
@abstractmethod
def read(self, path: str, mode="text"):
def read(self, path: str, mode=MODE_TXT):
"""
无论对于本地还是s3的路径,检查如果path是绝对路径,那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
"""
......
import os
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from loguru import logger
MODE_TXT = "text"
MODE_BIN = "binary"
class DiskReaderWriter(AbsReaderWriter):
def __init__(self, parent_path, encoding='utf-8'):
self.path = parent_path
self.encoding = encoding
def read(self, mode="text"):
if not os.path.exists(self.path):
logger.error(f"文件 {self.path} 不存在")
raise Exception(f"文件 {self.path} 不存在")
if mode == "text":
with open(self.path, 'r', encoding = self.encoding) as f:
def read(self, path, mode=MODE_TXT):
if os.path.isabs(path):
abspath = path
else:
abspath = os.path.join(self.path, path)
if not os.path.exists(abspath):
logger.error(f"文件 {abspath} 不存在")
raise Exception(f"文件 {abspath} 不存在")
if mode == MODE_TXT:
with open(abspath, 'r', encoding = self.encoding) as f:
return f.read()
elif mode == "binary":
with open(self.path, 'rb') as f:
elif mode == MODE_BIN:
with open(abspath, 'rb') as f:
return f.read()
else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
def write(self, data, mode="text"):
if mode == "text":
with open(self.path, 'w', encoding=self.encoding) as f:
f.write(data)
logger.info(f"内容已成功写入 {self.path}")
def write(self, content, path, mode=MODE_TXT):
if os.path.isabs(path):
abspath = path
else:
abspath = os.path.join(self.path, path)
if mode == MODE_TXT:
with open(abspath, 'w', encoding=self.encoding) as f:
f.write(content)
logger.info(f"内容已成功写入 {abspath}")
elif mode == "binary":
with open(self.path, 'wb') as f:
f.write(data)
logger.info(f"内容已成功写入 {self.path}")
elif mode == MODE_BIN:
with open(abspath, 'wb') as f:
f.write(content)
logger.info(f"内容已成功写入 {abspath}")
else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
return self.read(path)
# 使用示例
if __name__ == "__main__":
file_path = "example.txt"
drw = DiskReaderWriter(file_path)
file_path = "io/example.txt"
drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
# 写入内容到文件
drw.write(b"Hello, World!", mode="binary")
drw.write(b"Hello, World!", path="io/example.txt", mode="binary")
# 从文件读取内容
content = drw.read()
content = drw.read(path=file_path)
if content:
logger.info(f"从 {file_path} 读取的内容: {content}")
......
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key
import boto3
from loguru import logger
from boto3.s3.transfer import TransferConfig
from botocore.config import Config
import os
MODE_TXT = "text"
MODE_BIN = "binary"
class S3ReaderWriter(AbsReaderWriter):
def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str, parent_path: str):
self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
self.path = parent_path
def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
s3_client = boto3.client(
......@@ -22,51 +25,83 @@ class S3ReaderWriter(AbsReaderWriter):
retries={'max_attempts': 5, 'mode': 'standard'}),
)
return s3_client
def read(self, s3_path, mode="text", encoding="utf-8"):
bucket_name, bucket_key = parse_bucket_key(s3_path)
res = self.client.get_object(Bucket=bucket_name, Key=bucket_key)
def read(self, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
if s3_relative_path.startswith("s3://"):
s3_path = s3_relative_path
else:
s3_path = os.path.join(self.path, s3_relative_path)
bucket_name, key = parse_bucket_key(s3_path)
res = self.client.get_object(Bucket=bucket_name, Key=key)
body = res["Body"].read()
if mode == 'text':
if mode == MODE_TXT:
data = body.decode(encoding) # Decode bytes to text
elif mode == 'binary':
elif mode == MODE_BIN:
data = body
else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
return data
def write(self, data, s3_path, mode="text", encoding="utf-8"):
if mode == 'text':
body = data.encode(encoding) # Encode text data as bytes
elif mode == 'binary':
body = data
def write(self, content, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
if s3_relative_path.startswith("s3://"):
s3_path = s3_relative_path
else:
s3_path = os.path.join(self.path, s3_relative_path)
if mode == MODE_TXT:
body = content.encode(encoding) # Encode text data as bytes
elif mode == MODE_BIN:
body = content
else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
bucket_name, bucket_key = parse_bucket_key(s3_path)
self.client.put_object(Body=body, Bucket=bucket_name, Key=bucket_key)
bucket_name, key = parse_bucket_key(s3_path)
self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
logger.info(f"内容已写入 {s3_path} ")
def read_jsonl(self, path: str, byte_start=0, byte_end=None, mode=MODE_TXT, encoding='utf-8'):
if path.startswith("s3://"):
s3_path = path
else:
s3_path = os.path.join(self.path, path)
bucket_name, key = parse_bucket_key(s3_path)
range_header = f'bytes={byte_start}-{byte_end}' if byte_end else f'bytes={byte_start}-'
res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
body = res["Body"].read()
if mode == MODE_TXT:
data = body.decode(encoding) # Decode bytes to text
elif mode == MODE_BIN:
data = body
else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
return data
if __name__ == "__main__":
# Config the connection info
ak = ""
sk = ""
endpoint_url = ""
addressing_style = ""
addressing_style = "auto"
bucket_name = ""
# Create an S3ReaderWriter object
s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style)
s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style, "s3://bucket_name/")
# Write text data to S3
text_data = "This is some text data"
s3_reader_writer.write(data=text_data, s3_path = "s3://bucket_name/ebook/test/test.json", mode='text')
s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
# Read text data from S3
text_data_read = s3_reader_writer.read(s3_path = "s3://bucket_name/ebook/test/test.json", mode='text')
text_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
logger.info(f"Read text data from S3: {text_data_read}")
# Write binary data to S3
binary_data = b"This is some binary data"
s3_reader_writer.write(data=text_data, s3_path = "s3://bucket_name/ebook/test/test2.json", mode='binary')
s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
# Read binary data from S3
binary_data_read = s3_reader_writer.read(s3_path = "s3://bucket_name/ebook/test/test2.json", mode='binary')
binary_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
logger.info(f"Read binary data from S3: {binary_data_read}")
# Range Read text data from S3
binary_data_read = s3_reader_writer.read_jsonl(path=f"s3://{bucket_name}/ebook/test/test.json",
byte_start=0, byte_end=10, mode=MODE_BIN)
logger.info(f"Read binary data from S3: {binary_data_read}")
......@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
json_dump_path = "s3://llm-pdf-text/json_dump/"
s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # TODO 基础库不应该有这些存在的路径,应该在业务代码中定义
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径,应该在业务代码中定义
def get_top_percent_list(num_list, percent):
......@@ -120,28 +120,8 @@ def read_file(pdf_path: str, s3_profile):
return f.read()
def get_docx_model_output(pdf_model_output, pdf_model_s3_profile, page_id):
if isinstance(pdf_model_output, str):
model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json") # 模型输出的页面编号从1开始的
if os.path.exists(model_output_json_path):
json_from_docx = read_file(model_output_json_path, pdf_model_s3_profile)
model_output_json = json.loads(json_from_docx)
else:
try:
model_output_json_path = join_path(pdf_model_output, "model.json")
with open(model_output_json_path, "r", encoding="utf-8") as f:
model_output_json = json.load(f)
model_output_json = model_output_json["doc_layout_result"][page_id]
except:
s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")
s3_model_output_json_path = join_path(pdf_model_output, f"{page_id}.json")
#s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id }.json")
# logger.warning(f"model_output_json_path: {model_output_json_path} not found. try to load from s3: {s3_model_output_json_path}")
s = read_file(s3_model_output_json_path, pdf_model_s3_profile)
return json.loads(s)
elif isinstance(pdf_model_output, list):
def get_docx_model_output(pdf_model_output, page_id):
model_output_json = pdf_model_output[page_id]
return model_output_json
......
import hashlib
def compute_md5(file_bytes):
hasher = hashlib.md5()
hasher.update(file_bytes)
return hasher.hexdigest().upper()
def compute_sha256(input_string):
hasher = hashlib.sha256()
# 在Python3中,需要将字符串转化为字节对象才能被哈希函数处理
input_bytes = input_string.encode('utf-8')
hasher.update(input_bytes)
return hasher.hexdigest()
import os
from pathlib import Path
from typing import Tuple
import io
# from app.common.s3 import get_s3_client
from magic_pdf.libs.commons import fitz
from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key, join_path
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_return_path=None, img_s3_client=None, upload_switch=True):
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
"""
# 拼接文件名
filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
# 拼接路径
image_save_path = join_path(save_parent_path, filename)
s3_img_path = join_path(s3_return_path, filename) if s3_return_path is not None else None
# 打印图片文件名
# print(f"Saved {image_save_path}")
#检查坐标
# x_check = int(bbox[2]) - int(bbox[0])
# y_check = int(bbox[3]) - int(bbox[1])
# if x_check <= 0 or y_check <= 0:
#
# if image_save_path.startswith("s3://"):
# logger.exception(f"传入图片坐标有误,x1<x0或y1<y0,{s3_img_path}")
# return s3_img_path
# else:
# logger.exception(f"传入图片坐标有误,x1<x0或y1<y0,{image_save_path}")
# return image_save_path
filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
# 老版本返回不带bucket的路径
img_path = join_path(return_path, filename) if return_path is not None else None
# 新版本生成平铺路径
img_hash256_path = f"{compute_sha256(img_path)}.jpg"
# 将坐标转换为fitz.Rect对象
rect = fitz.Rect(*bbox)
......@@ -42,39 +26,17 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
# 截取图片
pix = page.get_pixmap(clip=rect, matrix=zoom)
if image_save_path.startswith("s3://"):
if not upload_switch:
pass
else:
# 图片保存到s3
bucket_name, bucket_key = parse_bucket_key(image_save_path)
# 将字节流上传到s3
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
file_obj = io.BytesIO(byte_data)
if img_s3_client is not None:
img_s3_client.upload_fileobj(file_obj, bucket_name, bucket_key)
# 每个图片上传任务都创建一个新的client
# img_s3_client_once = get_s3_client(image_save_path)
# img_s3_client_once.upload_fileobj(file_obj, bucket_name, bucket_key)
else:
logger.exception("must input img_s3_client")
return s3_img_path
else:
# 保存图片到本地
# 先检查一下image_save_path的父目录是否存在,如果不存在,就创建
parent_dir = os.path.dirname(image_save_path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
pix.save(image_save_path, jpg_quality=95)
# 为了直接能在markdown里看,这里把地址改为相对于mardown的地址
pth = Path(image_save_path)
image_save_path = f"{pth.parent.name}/{pth.name}"
return image_save_path
def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_path: str,
image_bboxes: list, images_overlap_backup:list, table_bboxes: list, equation_inline_bboxes: list,
equation_interline_bboxes: list, img_s3_client) -> dict:
imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
return img_hash256_path
def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
equation_inline_bboxes: list,
equation_interline_bboxes: list, imageWriter) -> dict:
"""
返回一个dict, key为bbox, 值是图片地址
"""
......@@ -85,53 +47,30 @@ def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_p
interline_eq_info = []
# 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
s3_return_image_path = join_path(book_name, "images")
image_save_path = join_path(save_path, s3_return_image_path)
s3_return_table_path = join_path(book_name, "tables")
table_save_path = join_path(save_path, s3_return_table_path)
s3_return_equations_inline_path = join_path(book_name, "equations_inline")
equation_inline_save_path = join_path(save_path, s3_return_equations_inline_path)
s3_return_equation_interline_path = join_path(book_name, "equation_interline")
equation_interline_save_path = join_path(save_path, s3_return_equation_interline_path)
def return_path(type):
return join_path(pdf_bytes_md5, type)
for bbox in image_bboxes:
if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"image_bboxes: 错误的box, {bbox}")
continue
image_path = cut_image(bbox, page_num, page, image_save_path, s3_return_image_path, img_s3_client)
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_info.append({"bbox": bbox, "image_path": image_path})
for bbox in images_overlap_backup:
if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"images_overlap_backup: 错误的box, {bbox}")
continue
image_path = cut_image(bbox, page_num, page, image_save_path, s3_return_image_path, img_s3_client)
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_backup_info.append({"bbox": bbox, "image_path": image_path})
for bbox in table_bboxes:
if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"table_bboxes: 错误的box, {bbox}")
continue
image_path = cut_image(bbox, page_num, page, table_save_path, s3_return_table_path, img_s3_client)
image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
table_info.append({"bbox": bbox, "image_path": image_path})
for bbox in equation_inline_bboxes:
if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
logger.warning(f"equation_inline_bboxes: 错误的box, {bbox}")
continue
image_path = cut_image(bbox[:4], page_num, page, equation_inline_save_path, s3_return_equations_inline_path, img_s3_client, upload_switch=False)
inline_eq_info.append({'bbox':bbox[:4], "image_path":image_path, "latex_text":bbox[4]})
for bbox in equation_interline_bboxes:
if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
logger.warning(f"equation_interline_bboxes: 错误的box, {bbox}")
continue
image_path = cut_image(bbox[:4], page_num, page, equation_interline_save_path, s3_return_equation_interline_path, img_s3_client, upload_switch=False)
interline_eq_info.append({"bbox":bbox[:4], "image_path":image_path, "latex_text":bbox[4]})
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
\ No newline at end of file
import json
import os
import time
from loguru import logger
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_text_bbox
from magic_pdf.libs.commons import (
read_file,
join_path,
fitz,
get_img_s3_client,
get_delta_time,
get_docx_model_output,
)
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.para.para_split import para_split
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
......@@ -38,38 +30,16 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def parse_pdf_by_ocr(
pdf_bytes,
pdf_model_output,
save_path,
book_name,
pdf_model_profile=None,
image_s3_config=None,
imageWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
book_name = sanitize_filename(book_name)
md_bookname_save_path = ""
if debug_mode:
save_path = join_path(save_tmp_path, "md")
pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
if not os.path.exists(os.path.dirname(pdf_local_path)):
# 如果目录不存在,创建它
os.makedirs(os.path.dirname(pdf_local_path))
md_bookname_save_path = join_path(save_tmp_path, "md", book_name)
if not os.path.exists(md_bookname_save_path):
# 如果目录不存在,创建它
os.makedirs(md_bookname_save_path)
with open(pdf_local_path + ".pdf", "wb") as pdf_file:
pdf_file.write(pdf_bytes)
pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes)
# 初始化空的pdf_info_dict
pdf_info_dict = {}
img_s3_client = get_img_s3_client(save_path, image_s3_config)
start_time = time.time()
......@@ -91,16 +61,14 @@ def parse_pdf_by_ocr(
# 获取当前页的模型数据
ocr_page_info = get_docx_model_output(
pdf_model_output, pdf_model_profile, page_id
pdf_model_output, page_id
)
"""从json中获取每页的页码、页眉、页脚的bbox"""
page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
header_bboxes = parse_headers(page_id, page, ocr_page_info)
footer_bboxes = parse_footers(page_id, page, ocr_page_info)
footnote_bboxes = parse_footnotes_by_model(
page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode
)
footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, debug_mode=debug_mode)
# 构建需要remove的bbox字典
need_remove_spans_bboxes_dict = {
......@@ -179,7 +147,7 @@ def parse_pdf_by_ocr(
spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
'''对image和table截图'''
spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
spans = cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list = []
......@@ -242,16 +210,4 @@ def parse_pdf_by_ocr(
"""分段"""
para_split(pdf_info_dict, debug_mode=debug_mode)
'''在测试时,保存调试信息'''
if debug_mode:
params_file_save_path = join_path(
save_tmp_path, "md", book_name, "preproc_out.json"
)
with open(params_file_save_path, "w", encoding="utf-8") as f:
json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
# drow_bbox
draw_layout_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
draw_text_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
return pdf_info_dict
This diff is collapsed.
......@@ -112,7 +112,6 @@ def parse_pdf_for_train(
pdf_model_output,
save_path,
book_name,
pdf_model_profile=None,
image_s3_config=None,
start_page_id=0,
end_page_id=None,
......@@ -200,7 +199,7 @@ def parse_pdf_for_train(
flags=fitz.TEXTFLAGS_TEXT,
)["blocks"]
model_output_json = get_docx_model_output(
pdf_model_output, pdf_model_profile, page_id
pdf_model_output, page_id
)
# 解析图片
......
......@@ -3,7 +3,7 @@ from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path=None, debug_mode=False):
"""
:param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
:param page :fitz读取的当前页的内容
......
......@@ -3,18 +3,16 @@ from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
def s3_return_path(type):
return join_path(book_name, type)
def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
def img_save_path(type):
return join_path(save_path, s3_return_path(type))
def return_path(type):
return join_path(pdf_bytes_md5, type)
for span in spans:
span_type = span['type']
if span_type == ContentType.Image:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'), s3_return_path=s3_return_path('images'), img_s3_client=img_s3_client)
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'), imageWriter=imageWriter)
elif span_type == ContentType.Table:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'), s3_return_path=s3_return_path('tables'), img_s3_client=img_s3_client)
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), imageWriter=imageWriter)
return spans
......@@ -12,27 +12,78 @@
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from loguru import logger
from magic_pdf.io import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
"""
解析文本类pdf
"""
pass
pdf_info_dict = parse_pdf_by_txt(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
pdf_info_dict["parse_type"] = "txt"
return pdf_info_dict
def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
"""
解析ocr类pdf
"""
pass
pdf_info_dict = parse_pdf_by_ocr(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
pdf_info_dict["parse_type"] = "ocr"
return pdf_info_dict
def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
"""
ocr和文本混合的pdf,全部解析出来
"""
def parse_pdf(method):
try:
return method(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
except Exception as e:
logger.error(f"{method.__name__} error: {e}")
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get("need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
else:
pdf_info_dict["parse_type"] = "ocr"
else:
pdf_info_dict["parse_type"] = "txt"
return pdf_info_dict
def spark_json_extractor(jso:dict):
pass
......@@ -867,6 +867,7 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
save_results(result_dict, overall_report_dict,badcase_file,overall_file)
result=compare_edit_distance(base_data_path, overall_report_dict)
<<<<<<< HEAD
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try:
......@@ -874,7 +875,10 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e:
print(f"上传到S3时发生错误: {e}")
=======
>>>>>>> ff8f62aa3c28facc192104387f131d87978064fc
print(result)
assert result == 1
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="主函数,执行整个评估流程。")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment