Commit 709a6500 authored by 赵小蒙's avatar 赵小蒙

中间态dict结构调整

部分函数重构
parent 969f08dd
......@@ -15,7 +15,7 @@ from loguru import logger
from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.spark.base import get_data_source
from magic_pdf.spark.spark_api import get_data_source
def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
......
......@@ -228,12 +228,12 @@ def __insert_before_para(text, type, element, content_list):
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
def mk_universal_format(para_dict: dict, img_buket_path):
def mk_universal_format(pdf_info_list: list, img_buket_path):
"""
构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
"""
content_lst = []
for _, page_info in para_dict.items():
for page_info in pdf_info_list:
page_lst = [] # 一个page内的段落列表
para_blocks = page_info.get("para_blocks")
pymu_raw_blocks = page_info.get("preproc_blocks")
......
......@@ -69,11 +69,11 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
return '\n'.join(markdown)
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
markdown = []
for _, page_info in pdf_info_dict.items():
for page_info in pdf_info_list:
paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm")
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm", img_buket_path)
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
......@@ -100,7 +100,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
return markdown_with_para_and_pagination
def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path):
page_markdown = []
for paras in paras_of_layout:
for para in paras:
......@@ -123,7 +123,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]:
if mode == 'mm':
content = f"\n![]({span['image_path']})\n"
content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
elif mode == 'nlp':
pass
if content != '':
......
def dict_to_list(input_dict):
items_list = []
for _, item in input_dict.items():
items_list.append(item)
return items_list
......@@ -28,7 +28,7 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
imageWriter.write(content=byte_data, path=img_hash256_path, mode="binary")
return img_hash256_path
......
......@@ -5,6 +5,7 @@ from magic_pdf.libs.commons import (
get_delta_time,
get_docx_model_output,
)
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.hash_utils import compute_md5
......@@ -210,4 +211,10 @@ def parse_pdf_by_ocr(
"""分段"""
para_split(pdf_info_dict, debug_mode=debug_mode)
return pdf_info_dict
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = {
"pdf_info": pdf_info_list,
}
return new_pdf_info_dict
......@@ -11,6 +11,7 @@ from magic_pdf.layout.bbox_sort import (
prepare_bboxes_for_layout_split,
)
from magic_pdf.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.markdown_utils import escape_special_markdown_char
......@@ -400,4 +401,11 @@ def parse_pdf_by_txt(
if error_info is not None:
return _deal_with_text_exception(error_info)
return pdf_info_dict
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = {
"pdf_info": pdf_info_list,
}
return new_pdf_info_dict
import json
from loguru import logger
from magic_pdf.dict2md.mkcontent import mk_universal_format
from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para
from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.detect_language_from_model import get_language_from_model
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.spark.spark_api import parse_union_pdf, parse_ocr_pdf
from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe:
def __init__(self):
pass
def classify(self, pdf_bytes: bytes) -> str:
@staticmethod
def classify(pdf_bytes: bytes) -> str:
"""
根据pdf的元数据,判断是否是文本pdf,还是ocr pdf
"""
......@@ -57,25 +63,64 @@ class UNIPipe:
pdf_mid_data = parse_ocr_pdf(pdf_bytes, jso_useful_key['model_list'], image_writer)
else:
raise Exception(f"pdf type is not txt or ocr")
return JsonCompressor.compress(pdf_mid_data)
return JsonCompressor.compress_json(pdf_mid_data)
def mk_uni_format(self, pdf_mid_data: str, img_buket_path: str) -> list:
@staticmethod
def mk_uni_format(pdf_mid_data: str, img_buket_path: str) -> list:
"""
根据pdf类型,生成统一格式content_list
"""
pdf_mid_data = JsonCompressor.decompress_json(pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == "txt":
content_list = mk_universal_format(pdf_mid_data, img_buket_path)
content_list = mk_universal_format(pdf_info_list, img_buket_path)
elif parse_type == "ocr":
content_list = make_standard_format_with_para(pdf_mid_data, img_buket_path)
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
return content_list
@staticmethod
def mk_markdown(pdf_mid_data: str, img_buket_path: str) -> list:
"""
根据pdf类型,markdown
"""
pdf_mid_data = JsonCompressor.decompress_json(pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == "txt":
content_list = mk_universal_format(pdf_info_list, img_buket_path)
md_content = mk_mm_markdown(content_list)
elif parse_type == "ocr":
md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
return md_content
if __name__ == '__main__':
# 测试
pipe = UNIPipe()
pdf_bytes = open(r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\数学新星网\edu_00001544.pdf",
"rb").read()
pdf_type = pipe.classify(pdf_bytes)
# file_path = r"tmp/unittest/download-pdfs/数学新星网/edu_00001236.pdf"
drw = DiskReaderWriter(r"D:/project/20231108code-clean")
# pdf_bytes = drw.read(path=file_path, mode=AbsReaderWriter.MODE_BIN)
# pdf_type = UNIPipe.classify(pdf_bytes)
# logger.info(f"pdf_type is {pdf_type}")
pdf_file_path = r"linshixuqiu\25536-00.pdf"
model_file_path = r"linshixuqiu\25536-00.json"
pdf_bytes = drw.read(path=pdf_file_path, mode=AbsReaderWriter.MODE_BIN)
model_json_txt = drw.read(path=model_file_path, mode=AbsReaderWriter.MODE_TXT)
pdf_type = UNIPipe.classify(pdf_bytes)
logger.info(f"pdf_type is {pdf_type}")
jso_useful_key = {
"_pdf_type": pdf_type,
"model_list": json.loads(model_json_txt),
}
pipe = UNIPipe()
write_path = r"D:\project\20231108code-clean\linshixuqiu\25536-00"
img_buket_path = "imgs"
img_writer = DiskReaderWriter(join_path(write_path, img_buket_path))
pdf_mid_data = pipe.parse(pdf_bytes, img_writer, jso_useful_key)
md_content = pipe.mk_markdown(pdf_mid_data, "imgs")
md_writer = DiskReaderWriter(write_path)
md_writer.write(content=md_content, path="25536-00.md", mode=AbsReaderWriter.MODE_TXT)
md_writer.write(content=json.dumps(JsonCompressor.decompress_json(pdf_mid_data), ensure_ascii=False, indent=4), path="25536-00.json", mode=AbsReaderWriter.MODE_TXT)
from loguru import logger
from magic_pdf.libs.drop_reason import DropReason
from loguru import logger
from magic_pdf.libs.drop_reason import DropReason
def get_data_source(jso: dict):
data_source = jso.get("data_source")
if data_source is None:
data_source = jso.get("file_source")
return data_source
def get_data_type(jso: dict):
data_type = jso.get("data_type")
if data_type is None:
data_type = jso.get("file_type")
return data_type
def get_bookid(jso: dict):
book_id = jso.get("bookid")
if book_id is None:
book_id = jso.get("original_file_id")
return book_id
def exception_handler(jso: dict, e):
logger.exception(e)
jso["_need_drop"] = True
jso["_drop_reason"] = DropReason.Exception
jso["_exception"] = f"ERROR: {e}"
return jso
def get_bookname(jso: dict):
data_source = get_data_source(jso)
file_id = jso.get("file_id")
book_name = f"{data_source}/{file_id}"
return book_name
"""
用户输入:
model数组,每个元素代表一个页面
pdf在s3的路径
截图保存的s3位置
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from loguru import logger
from magic_pdf.io import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.libs.drop_reason import DropReason
def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
"""
解析文本类pdf
"""
pdf_info_dict = parse_pdf_by_txt(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
def get_data_source(jso: dict):
data_source = jso.get("data_source")
if data_source is None:
data_source = jso.get("file_source")
return data_source
pdf_info_dict["parse_type"] = "txt"
return pdf_info_dict
def get_data_type(jso: dict):
data_type = jso.get("data_type")
if data_type is None:
data_type = jso.get("file_type")
return data_type
def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
"""
解析ocr类pdf
"""
pdf_info_dict = parse_pdf_by_ocr(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = "ocr"
return pdf_info_dict
def get_bookid(jso: dict):
book_id = jso.get("bookid")
if book_id is None:
book_id = jso.get("original_file_id")
return book_id
def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
"""
ocr和文本混合的pdf,全部解析出来
"""
def parse_pdf(method):
try:
return method(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
except Exception as e:
logger.error(f"{method.__name__} error: {e}")
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
def exception_handler(jso: dict, e):
logger.exception(e)
jso["_need_drop"] = True
jso["_drop_reason"] = DropReason.Exception
jso["_exception"] = f"ERROR: {e}"
return jso
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
else:
pdf_info_dict["_parse_type"] = "ocr"
else:
pdf_info_dict["_parse_type"] = "txt"
return pdf_info_dict
def get_bookname(jso: dict):
data_source = get_data_source(jso)
file_id = jso.get("file_id")
book_name = f"{data_source}/{file_id}"
return book_name
def spark_json_extractor(jso: dict) -> dict:
......
"""
用户输入:
model数组,每个元素代表一个页面
pdf在s3的路径
截图保存的s3位置
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from loguru import logger
from magic_pdf.io import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
**kwargs):
"""
解析文本类pdf
"""
pdf_info_dict = parse_pdf_by_txt(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
pdf_info_dict["parse_type"] = "txt"
return pdf_info_dict
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
**kwargs):
"""
解析ocr类pdf
"""
pdf_info_dict = parse_pdf_by_ocr(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = "ocr"
return pdf_info_dict
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
*args, **kwargs):
"""
ocr和文本混合的pdf,全部解析出来
"""
def parse_pdf(method):
try:
return method(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
except Exception as e:
logger.error(f"{method.__name__} error: {e}")
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
else:
pdf_info_dict["_parse_type"] = "ocr"
else:
pdf_info_dict["_parse_type"] = "txt"
return pdf_info_dict
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment