Commit 698c4a83 authored by kernel.h@qq.com

修改pipe模块

parent ffc20db7
......@@ -62,13 +62,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
if parse_method == "ocr":
jso_useful_key["_pdf_type"] = "ocr"
pdf_mid_data = uni_pipe.pipe_parse()
md_content = UNIPipe.mk_markdown(pdf_mid_data, image_dir)
uni_pipe.pipe_parse()
md_content = uni_pipe.pipe_mk_markdown()
part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer.write(content=md_content, path=f"{part_file_name}.md", mode=MODE_TXT)
md_writer.write(
content=json_parse.dumps(
JsonCompressor.decompress_json(pdf_mid_data), ensure_ascii=False, indent=4
uni_pipe.pdf_mid_data, ensure_ascii=False, indent=4
),
path=f"{part_file_name}.json",
mode=MODE_TXT,
......
......@@ -589,6 +589,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
3. 参照上述行尾特征进行分段。
4. 图、表,目前独占一行,不考虑分段。
"""
if page_num==343:
pass
lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段
layout_paras, layout_list_info = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段
layout_paras2, page_list_info = __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang) # layout之间连接列表段落
......
......@@ -13,11 +13,18 @@ class AbsPipe(ABC):
"""
txt和ocr处理的抽象类
"""
PIP_OCR = "ocr"
PIP_TXT = "txt"
def __init__(
    self,
    pdf_bytes: bytes,
    model_list: list,
    image_writer: AbsReaderWriter,
    img_parent_path: str,
):
    """Store the inputs shared by every pipe and reset the parse result.

    Args:
        pdf_bytes: Bytes of the PDF to process.
        model_list: Model output; stored as-is for the parse step.
        image_writer: Writer object; stored as-is for the parse step.
        img_parent_path: Image path later passed to the markdown and
            uni-format builders.
    """
    self.pdf_bytes = pdf_bytes
    self.model_list = model_list
    self.image_writer = image_writer
    self.img_parent_path = img_parent_path
    # Uncompressed parse result; stays None until a parse step assigns it.
    self.pdf_mid_data = None
def get_compress_pdf_mid_data(self):
    """Return ``self.pdf_mid_data`` run through ``JsonCompressor.compress_json``.

    The compressed form is not cached; it is recomputed on every call.
    """
    compressed = JsonCompressor.compress_json(self.pdf_mid_data)
    return compressed
@abstractmethod
def pipe_classify(self):
......
......@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe):
    """Pipe that always parses the PDF with ``parse_ocr_pdf``."""

    def __init__(
        self,
        pdf_bytes: bytes,
        model_list: list,
        image_writer: AbsReaderWriter,
        img_parent_path: str,
    ):
        """Forward every argument unchanged to ``AbsPipe.__init__``."""
        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)

    def pipe_classify(self):
        """Do nothing: this pipe has a single, fixed parse method."""
        pass

    def pipe_parse(self):
        """Run the OCR parser and keep its result on ``self.pdf_mid_data``."""
        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)

    def pipe_mk_uni_format(self):
        """Return the uni-format content list for the current parse result.

        The parse result is compressed on demand because
        ``AbsPipe.mk_uni_format`` is fed the compressed form.
        """
        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
        return content_list

    def pipe_mk_markdown(self):
        """Return the markdown text for the current parse result.

        The parse result is compressed on demand because
        ``AbsPipe.mk_markdown`` is fed the compressed form.
        """
        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
        return md_content
......@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe):
    """Pipe that always parses the PDF with ``parse_txt_pdf``."""

    def __init__(
        self,
        pdf_bytes: bytes,
        model_list: list,
        image_writer: AbsReaderWriter,
        img_parent_path: str,
    ):
        """Forward every argument unchanged to ``AbsPipe.__init__``."""
        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)

    def pipe_classify(self):
        """Do nothing: this pipe has a single, fixed parse method."""
        pass

    def pipe_parse(self):
        """Run the text parser and keep its result on ``self.pdf_mid_data``."""
        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer)

    def pipe_mk_uni_format(self):
        """Return the uni-format content list for the current parse result.

        The parse result is compressed on demand because
        ``AbsPipe.mk_uni_format`` is fed the compressed form.
        """
        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
        return content_list

    def pipe_mk_markdown(self):
        """Return the markdown text for the current parse result.

        The parse result is compressed on demand because
        ``AbsPipe.mk_markdown`` is fed the compressed form.
        """
        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
        return md_content
......@@ -15,31 +15,25 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe(AbsPipe):
    """Pipe that picks the parser according to ``self.pdf_type``."""

    def __init__(
        self,
        pdf_bytes: bytes,
        model_list: list,
        image_writer: AbsReaderWriter,
        img_parent_path: str,
    ):
        """Default the PDF type to OCR, then delegate to ``AbsPipe.__init__``."""
        # Used as-is by pipe_parse() unless pipe_classify() overwrites it first.
        self.pdf_type = self.PIP_OCR
        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)

    def pipe_classify(self):
        """Replace ``self.pdf_type`` with the result of ``UNIPipe.classify``."""
        self.pdf_type = UNIPipe.classify(self.pdf_bytes)

    def pipe_parse(self):
        """Parse the PDF with the parser matching ``self.pdf_type``.

        ``PIP_TXT`` selects ``parse_union_pdf`` and ``PIP_OCR`` selects
        ``parse_ocr_pdf``. Any other value leaves ``self.pdf_mid_data``
        untouched, because there is no fallback branch.
        """
        if self.pdf_type == self.PIP_TXT:
            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer)
        elif self.pdf_type == self.PIP_OCR:
            self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)

    def pipe_mk_uni_format(self):
        """Return the uni-format content list for the current parse result.

        The parse result is compressed on demand because
        ``AbsPipe.mk_uni_format`` is fed the compressed form.
        """
        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
        return content_list

    def pipe_mk_markdown(self):
        """Return the markdown text for the current parse result.

        The parse result is compressed on demand because
        ``AbsPipe.mk_markdown`` is fed the compressed form.
        """
        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
        return markdown_content
if __name__ == '__main__':
......
......@@ -41,12 +41,10 @@ class DiskReaderWriter(AbsReaderWriter):
if mode == MODE_TXT:
with open(abspath, "w", encoding=self.encoding) as f:
f.write(content)
logger.info(f"内容已成功写入 {abspath}")
elif mode == MODE_BIN:
with open(abspath, "wb") as f:
f.write(content)
logger.info(f"内容已成功写入 {abspath}")
else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment