Commit f8f6ba6f authored by myhloli

update: Add md make mode config in do_parse. You can control whether the...

update: Add a markdown make-mode option to do_parse. You can control whether the produced md is for NLP or MM by changing the value of f_make_md_mode.
parent c5f939c5
...@@ -28,7 +28,7 @@ from loguru import logger ...@@ -28,7 +28,7 @@ from loguru import logger
from pathlib import Path from pathlib import Path
from magic_pdf.libs.version import __version__ from magic_pdf.libs.version import __version__
from magic_pdf.libs.MakeContentConfig import DropMode from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe from magic_pdf.pipe.OCRPipe import OCRPipe
...@@ -81,6 +81,7 @@ def do_parse( ...@@ -81,6 +81,7 @@ def do_parse(
f_dump_model_json=True, f_dump_model_json=True,
f_dump_orig_pdf=True, f_dump_orig_pdf=True,
f_dump_content_list=True, f_dump_content_list=True,
f_make_md_mode=MakeMode.MM_MD,
): ):
orig_model_list = copy.deepcopy(model_list) orig_model_list = copy.deepcopy(model_list)
...@@ -118,7 +119,7 @@ def do_parse( ...@@ -118,7 +119,7 @@ def do_parse(
if f_draw_span_bbox: if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir) draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE) md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
if f_dump_md: if f_dump_md:
"""写markdown""" """写markdown"""
md_writer.write( md_writer.write(
......
...@@ -47,19 +47,13 @@ class AbsPipe(ABC): ...@@ -47,19 +47,13 @@ class AbsPipe(ABC):
""" """
raise NotImplementedError raise NotImplementedError
@abstractmethod def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
def pipe_mk_uni_format(self, img_parent_path, drop_mode): content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
""" return content_list
有状态的组装统一格式
"""
raise NotImplementedError
@abstractmethod def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
def pipe_mk_markdown(self, img_parent_path, drop_mode): md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
""" return md_content
有状态的组装markdown
"""
raise NotImplementedError
@staticmethod @staticmethod
def classify(pdf_bytes: bytes) -> str: def classify(pdf_bytes: bytes) -> str:
...@@ -101,13 +95,13 @@ class AbsPipe(ABC): ...@@ -101,13 +95,13 @@ class AbsPipe(ABC):
return content_list return content_list
@staticmethod @staticmethod
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list: def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
""" """
根据pdf类型,markdown 根据pdf类型,markdown
""" """
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"] pdf_info_list = pdf_mid_data["pdf_info"]
md_content = union_make(pdf_info_list, MakeMode.MM_MD, drop_mode, img_buket_path) md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
return md_content return md_content
from magic_pdf.libs.MakeContentConfig import DropMode from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.pipe.AbsPipe import AbsPipe from magic_pdf.pipe.AbsPipe import AbsPipe
...@@ -7,7 +9,7 @@ from magic_pdf.user_api import parse_ocr_pdf ...@@ -7,7 +9,7 @@ from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe): class OCRPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False): def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
super().__init__(pdf_bytes, model_list, image_writer, is_debug) super().__init__(pdf_bytes, model_list, image_writer, is_debug)
def pipe_classify(self): def pipe_classify(self):
...@@ -20,9 +22,11 @@ class OCRPipe(AbsPipe): ...@@ -20,9 +22,11 @@ class OCRPipe(AbsPipe):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode) result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
return content_list logger.info("ocr_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode) def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
return md_content result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f"ocr_pipe mk {md_make_mode} finished")
return result
from magic_pdf.libs.MakeContentConfig import DropMode from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor from magic_pdf.libs.json_compressor import JsonCompressor
...@@ -8,7 +10,7 @@ from magic_pdf.user_api import parse_txt_pdf ...@@ -8,7 +10,7 @@ from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe): class TXTPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False): def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
super().__init__(pdf_bytes, model_list, image_writer, is_debug) super().__init__(pdf_bytes, model_list, image_writer, is_debug)
def pipe_classify(self): def pipe_classify(self):
...@@ -21,9 +23,11 @@ class TXTPipe(AbsPipe): ...@@ -21,9 +23,11 @@ class TXTPipe(AbsPipe):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode) result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
return content_list logger.info("txt_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode) def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
return md_content result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f"txt_pipe mk {md_make_mode} finished")
return result
...@@ -2,7 +2,7 @@ import json ...@@ -2,7 +2,7 @@ import json
from loguru import logger from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
...@@ -39,12 +39,14 @@ class UNIPipe(AbsPipe): ...@@ -39,12 +39,14 @@ class UNIPipe(AbsPipe):
is_debug=self.is_debug) is_debug=self.is_debug)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode) result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
return content_list logger.info("uni_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode) def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
return markdown_content result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f"uni_pipe mk {md_make_mode} finished")
return result
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment