Commit 51bb3b36 authored by 赵小蒙's avatar 赵小蒙

cut_image不报错公式图片,增加parse_union_pdf逻辑

parent c45fdcc8
......@@ -5,7 +5,7 @@ from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter, upload=True):
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
......@@ -19,17 +19,16 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
# 新版本生成平铺路径
img_hash256_path = f"{compute_sha256(img_path)}.jpg"
if upload:
# 将坐标转换为fitz.Rect对象
rect = fitz.Rect(*bbox)
# 配置缩放倍数为3倍
zoom = fitz.Matrix(3, 3)
# 截取图片
pix = page.get_pixmap(clip=rect, matrix=zoom)
# 将坐标转换为fitz.Rect对象
rect = fitz.Rect(*bbox)
# 配置缩放倍数为3倍
zoom = fitz.Matrix(3, 3)
# 截取图片
pix = page.get_pixmap(clip=rect, matrix=zoom)
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
return img_hash256_path
......@@ -74,18 +73,4 @@ def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
table_info.append({"bbox": bbox, "image_path": image_path})
for bbox in equation_inline_bboxes:
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"equation_inline_bboxes: 错误的box, {bbox}")
continue
image_path = cut_image(bbox[:4], page_num, page, return_path("equations_inline"), imageWriter, upload=False)
inline_eq_info.append({'bbox': bbox[:4], "image_path": image_path, "latex_text": bbox[4]})
for bbox in equation_interline_bboxes:
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"equation_interline_bboxes: 错误的box, {bbox}")
continue
image_path = cut_image(bbox[:4], page_num, page, return_path("equation_interline"), imageWriter, upload=False)
interline_eq_info.append({"bbox": bbox[:4], "image_path": image_path, "latex_text": bbox[4]})
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
\ No newline at end of file
......@@ -12,7 +12,7 @@
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from loguru import logger
from magic_pdf.io import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
......@@ -31,7 +31,6 @@ def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter
debug_mode=is_debug,
)
return pdf_info_dict
pass
def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
......@@ -52,4 +51,29 @@ def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWri
"""
ocr和文本混合的pdf,全部解析出来
"""
pass
\ No newline at end of file
def parse_pdf(method):
try:
return method(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
except Exception as e:
logger.error(f"{method.__name__} error: {e}")
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get("need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
return pdf_info_dict
def spark_json_extractor(jso:dict):
pass
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment