Commit 00f16239 authored by 赵小蒙

实现parse_ocr_pdf api,切图逻辑s3使用平铺地址,本地使用层级地址,删除预设s3_image_save_path

parent cfac3b25
# Builds a wheel for the package and publishes it as a GitHub release.
# Triggered by pushing a tag whose name ends in "released", or manually via workflow_dispatch.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
  push:
    tags:
      - '*released'
  workflow_dispatch:

jobs:
  # Build the wheel and hand it to the release job as an artifact.
  build:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

      - name: Install wheel
        run: |
          python -m pip install wheel

      - name: Build wheel
        run: |
          python setup.py bdist_wheel

      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: wheel-file
          path: dist/*.whl
          retention-days: 30

  # Attach the wheel produced by the build job to a GitHub release.
  release:
    needs: [ build ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download artifact
        uses: actions/download-artifact@v4
        with:
          name: wheel-file
          path: dist

      - name: Create and Upload Release
        id: create_release
        # Third-party action pinned to a full commit SHA.
        uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981
        with:
          files: './dist/*.whl'
        env:
          GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}

      # - name: Publish to PyPI
      #   uses: pypa/gh-action-pypi-publish@release/v1
      #   with:
      #     user: __token__
      #     password: ${{ secrets.PYPI_TOKEN }}
......@@ -116,7 +116,7 @@ if __name__ == '__main__':
pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path)
book_name = "科数网/edu_00011318"
book_name = "数学新星网/edu_00001236"
ocr_online_parse(book_name)
pass
from magic_pdf.libs.commons import s3_image_save_path, join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType
......@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
if not span.get('image_path'):
continue
else:
content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
content = f"![]({span['image_path']})"
else:
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
if span['type'] == ContentType.InlineEquation:
......@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]:
if mode == 'mm':
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
content = f"\n![]({span['image_path']})\n"
elif mode == 'nlp':
pass
if content != '':
......@@ -195,13 +194,13 @@ def line_to_standard_format(line):
if span['type'] == ContentType.Image:
content = {
'type': 'image',
'img_path': join_path(s3_image_save_path, span['image_path'])
'img_path': span['image_path']
}
return content
elif span['type'] == ContentType.Table:
content = {
'type': 'table',
'img_path': join_path(s3_image_save_path, span['image_path'])
'img_path': span['image_path']
}
return content
else:
......
......@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
json_dump_path = "s3://llm-pdf-text/json_dump/"
s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # TODO 基础库不应该有这些存在的路径,应该在业务代码中定义
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径,应该在业务代码中定义
def get_top_percent_list(num_list, percent):
......
import hashlib
def compute_md5(file_bytes):
    """Return the MD5 digest of ``file_bytes`` as an uppercase hex string."""
    # Feeding the data to the constructor is equivalent to a single update().
    digest = hashlib.md5(file_bytes).hexdigest()
    return digest.upper()
def compute_sha256(input_string):
    """Return the SHA-256 digest of the input as a lowercase hex string.

    Args:
        input_string: Text to hash; it is encoded as UTF-8 before hashing.
            Bytes-like input (``bytes``, ``bytearray``, ``memoryview``) is
            also accepted and hashed as-is.

    Returns:
        The 64-character lowercase hexadecimal digest.
    """
    # In Python 3 the hash functions only consume bytes-like data, so text
    # has to be encoded first; data that is already bytes is passed through.
    if isinstance(input_string, str):
        payload = input_string.encode('utf-8')
    else:
        payload = input_string
    return hashlib.sha256(payload).hexdigest()
......@@ -7,6 +7,7 @@ import io
from magic_pdf.libs.commons import fitz
from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key, join_path
from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_return_path=None, img_s3_client=None, upload_switch=True):
......@@ -16,9 +17,13 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
"""
# 拼接文件名
filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
# 拼接路径
image_save_path = join_path(save_parent_path, filename)
# 老版本返回不带bucket的路径
s3_img_path = join_path(s3_return_path, filename) if s3_return_path is not None else None
# 新版本生成s3的平铺路径
s3_img_hash256_path = f"{compute_sha256(s3_img_path)}.jpg"
# 打印图片文件名
# print(f"Saved {image_save_path}")
......@@ -42,12 +47,16 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
# 截取图片
pix = page.get_pixmap(clip=rect, matrix=zoom)
if image_save_path.startswith("s3://"):
if save_parent_path.startswith("s3://"):
if not upload_switch:
pass
else:
# 图片保存到s3
bucket_name, bucket_key = parse_bucket_key(image_save_path)
"""图片保存到s3"""
# 从save_parent_path获取bucket_name
bucket_name, bucket_key = parse_bucket_key(save_parent_path)
# 平铺路径赋值给bucket_key
bucket_key = s3_img_hash256_path
# 将字节流上传到s3
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
file_obj = io.BytesIO(byte_data)
......@@ -58,18 +67,21 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
# img_s3_client_once.upload_fileobj(file_obj, bucket_name, bucket_key)
else:
logger.exception("must input img_s3_client")
return s3_img_path
# return s3_img_path # 早期版本要求返回不带bucket的路径
s3_image_save_path = f"s3://{bucket_name}/{s3_img_hash256_path}" # 新版本返回平铺的s3路径
return s3_image_save_path
else:
# 保存图片到本地
# 先检查一下image_save_path的父目录是否存在,如果不存在,就创建
parent_dir = os.path.dirname(image_save_path)
local_image_save_path = join_path(save_parent_path, filename)
parent_dir = os.path.dirname(local_image_save_path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
pix.save(image_save_path, jpg_quality=95)
pix.save(local_image_save_path, jpg_quality=95)
# 为了直接能在markdown里看,这里把地址改为相对于mardown的地址
pth = Path(image_save_path)
image_save_path = f"{pth.parent.name}/{pth.name}"
return image_save_path
pth = Path(local_image_save_path)
local_image_save_path = f"{pth.parent.name}/{pth.name}"
return local_image_save_path
def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_path: str,
......
......@@ -15,6 +15,7 @@ from magic_pdf.libs.commons import (
)
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.para.para_split import para_split
......@@ -39,18 +40,18 @@ def parse_pdf_by_ocr(
pdf_bytes,
pdf_model_output,
save_path,
book_name,
book_name="",
pdf_model_profile=None,
image_s3_config=None,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
pdf_bytes_md5 = compute_md5(pdf_bytes)
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
book_name = sanitize_filename(book_name)
md_bookname_save_path = ""
if debug_mode:
book_name = sanitize_filename(book_name)
save_path = join_path(save_tmp_path, "md")
pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
......@@ -179,6 +180,8 @@ def parse_pdf_by_ocr(
spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
'''对image和table截图'''
if book_name == "":
book_name = pdf_bytes_md5
spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
......
......@@ -4,6 +4,9 @@ from magic_pdf.libs.pdf_image_tools import cut_image
def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
"""spark环境book_name为pdf_bytes_md5,本地环境会传正常bookname"""
def s3_return_path(type):
return join_path(book_name, type)
......
......@@ -15,6 +15,7 @@
from magic_pdf.io import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
......@@ -28,7 +29,14 @@ def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWrite
"""
解析ocr类pdf
"""
pass
pdf_info_dict = parse_pdf_by_ocr(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
return pdf_info_dict
def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment