Commit 698c4a83 authored by kernel.h@qq.com

修改pipe模块

parent ffc20db7
...@@ -62,13 +62,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag ...@@ -62,13 +62,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
if parse_method == "ocr": if parse_method == "ocr":
jso_useful_key["_pdf_type"] = "ocr" jso_useful_key["_pdf_type"] = "ocr"
pdf_mid_data = uni_pipe.pipe_parse() uni_pipe.pipe_parse()
md_content = UNIPipe.mk_markdown(pdf_mid_data, image_dir) md_content = uni_pipe.pipe_mk_markdown()
part_file_name = datetime.now().strftime("%H-%M-%S") part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer.write(content=md_content, path=f"{part_file_name}.md", mode=MODE_TXT) md_writer.write(content=md_content, path=f"{part_file_name}.md", mode=MODE_TXT)
md_writer.write( md_writer.write(
content=json_parse.dumps( content=json_parse.dumps(
JsonCompressor.decompress_json(pdf_mid_data), ensure_ascii=False, indent=4 uni_pipe.pdf_mid_data, ensure_ascii=False, indent=4
), ),
path=f"{part_file_name}.json", path=f"{part_file_name}.json",
mode=MODE_TXT, mode=MODE_TXT,
......
...@@ -589,6 +589,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang): ...@@ -589,6 +589,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
3. 参照上述行尾特征进行分段。 3. 参照上述行尾特征进行分段。
4. 图、表,目前独占一行,不考虑分段。 4. 图、表,目前独占一行,不考虑分段。
""" """
if page_num==343:
pass
lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段 lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段
layout_paras, layout_list_info = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段 layout_paras, layout_list_info = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段
layout_paras2, page_list_info = __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang) # layout之间连接列表段落 layout_paras2, page_list_info = __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang) # layout之间连接列表段落
......
...@@ -13,11 +13,18 @@ class AbsPipe(ABC): ...@@ -13,11 +13,18 @@ class AbsPipe(ABC):
""" """
txt和ocr处理的抽象类 txt和ocr处理的抽象类
""" """
PIP_OCR = "ocr"
PIP_TXT = "txt"
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter): def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, ):
self.pdf_bytes = pdf_bytes self.pdf_bytes = pdf_bytes
self.model_list = model_list self.model_list = model_list
self.image_writer = image_writer self.image_writer = image_writer
self.img_parent_path = img_parent_path
self.pdf_mid_data = None # 未压缩
def get_compress_pdf_mid_data(self):
return JsonCompressor.compress_json(self.pdf_mid_data)
@abstractmethod @abstractmethod
def pipe_classify(self): def pipe_classify(self):
......
...@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_ocr_pdf ...@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe): class OCRPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_bucket_path: str): def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
self.compressed_pdf_mid_data = None super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
self.pdf_mid_data = None
self.pdf_bytes = pdf_bytes
self.model_list = model_list
self.image_writer = image_writer
self.img_bucket_path = img_bucket_path
def pipe_classify(self): def pipe_classify(self):
pass pass
def pipe_parse(self): def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer) self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)
self.compressed_pdf_mid_data = JsonCompressor.compress_json(self.pdf_mid_data)
def pipe_mk_uni_format(self): def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.compressed_pdf_mid_data, self.img_bucket_path) content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
return content_list return content_list
def pipe_mk_markdown(self): def pipe_mk_markdown(self):
md_content = AbsPipe.mk_markdown(self.compressed_pdf_mid_data, self.img_bucket_path) md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
return md_content return md_content
...@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_txt_pdf ...@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe): class TXTPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_bucket_path: str): def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
self.compressed_pdf_mid_data = None super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
self.pdf_mid_data = None
self.pdf_bytes = pdf_bytes
self.model_list = model_list
self.image_writer = image_writer
self.img_bucket_path = img_bucket_path
def pipe_classify(self): def pipe_classify(self):
pass pass
def pipe_parse(self): def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer) self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer)
self.compressed_pdf_mid_data = JsonCompressor.compress_json(self.pdf_mid_data)
def pipe_mk_uni_format(self): def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.compressed_pdf_mid_data, self.img_bucket_path) content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
return content_list return content_list
def pipe_mk_markdown(self): def pipe_mk_markdown(self):
md_content = AbsPipe.mk_markdown(self.compressed_pdf_mid_data, self.img_bucket_path) md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
return md_content return md_content
...@@ -15,31 +15,25 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf ...@@ -15,31 +15,25 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe(AbsPipe): class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_bucket_path: str): def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
self.pdf_type = "ocr" self.pdf_type = self.PIP_OCR
self.compressed_pdf_mid_data = None super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
self.pdf_mid_data = None
self.pdf_bytes = pdf_bytes
self.model_list = model_list
self.image_writer = image_writer
self.img_bucket_path = img_bucket_path
def pipe_classify(self): def pipe_classify(self):
self.pdf_type = UNIPipe.classify(self.pdf_bytes) self.pdf_type = UNIPipe.classify(self.pdf_bytes)
def pipe_parse(self): def pipe_parse(self):
if self.pdf_type == "txt": if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer) self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer)
elif self.pdf_type == "ocr": elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer) self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)
self.compressed_pdf_mid_data = JsonCompressor.compress_json(self.pdf_mid_data)
def pipe_mk_uni_format(self): def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.compressed_pdf_mid_data, self.img_bucket_path) content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
return content_list return content_list
def pipe_mk_markdown(self): def pipe_mk_markdown(self):
markdown_content = AbsPipe.mk_markdown(self.compressed_pdf_mid_data, self.img_bucket_path) markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
return markdown_content return markdown_content
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -41,12 +41,10 @@ class DiskReaderWriter(AbsReaderWriter): ...@@ -41,12 +41,10 @@ class DiskReaderWriter(AbsReaderWriter):
if mode == MODE_TXT: if mode == MODE_TXT:
with open(abspath, "w", encoding=self.encoding) as f: with open(abspath, "w", encoding=self.encoding) as f:
f.write(content) f.write(content)
logger.info(f"内容已成功写入 {abspath}")
elif mode == MODE_BIN: elif mode == MODE_BIN:
with open(abspath, "wb") as f: with open(abspath, "wb") as f:
f.write(content) f.write(content)
logger.info(f"内容已成功写入 {abspath}")
else: else:
raise ValueError("Invalid mode. Use 'text' or 'binary'.") raise ValueError("Invalid mode. Use 'text' or 'binary'.")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment