Commit 5f3cf14a authored by 赵小蒙

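pipe初始化移除img_parent_path参数 (Remove the img_parent_path parameter from pipe initialization; it is now passed to pipe_mk_markdown() and pipe_mk_uni_format() instead.)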

parent a5e22396
......@@ -60,11 +60,11 @@ def prepare_env(pdf_file_name, method):
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
if parse_method == "auto":
pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
pipe = UNIPipe(pdf_bytes, model_list, image_writer, is_debug=True)
elif parse_method == "txt":
pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
elif parse_method == "ocr":
pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
else:
print("unknow parse method")
os.exit(1)
......@@ -74,7 +74,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
pdf_info = pipe.pdf_mid_data['pdf_info']
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
md_content = pipe.pipe_mk_markdown()
md_content = pipe.pipe_mk_markdown(image_dir)
#part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer.write(
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
......@@ -85,7 +85,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
mode=AbsReaderWriter.MODE_TXT,
)
try:
content_list = pipe.pipe_mk_uni_format()
content_list = pipe.pipe_mk_uni_format(image_dir)
except Exception as e:
logger.exception(e)
md_writer.write(
......
......@@ -16,11 +16,10 @@ class AbsPipe(ABC):
PIP_OCR = "ocr"
PIP_TXT = "txt"
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, is_debug:bool=False):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
self.pdf_bytes = pdf_bytes
self.model_list = model_list
self.image_writer = image_writer
self.img_parent_path = img_parent_path
self.pdf_mid_data = None # 未压缩
self.is_debug = is_debug
......
......@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
super().__init__(pdf_bytes, model_list, image_writer, is_debug)
def pipe_classify(self):
pass
......@@ -15,10 +15,10 @@ class OCRPipe(AbsPipe):
def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
def pipe_mk_uni_format(self, img_parent_path: str):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
return content_list
def pipe_mk_markdown(self):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
def pipe_mk_markdown(self, img_parent_path: str):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
return md_content
......@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
super().__init__(pdf_bytes, model_list, image_writer, is_debug)
def pipe_classify(self):
pass
......@@ -15,10 +15,10 @@ class TXTPipe(AbsPipe):
def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
def pipe_mk_uni_format(self, img_parent_path: str):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
return content_list
def pipe_mk_markdown(self):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
def pipe_mk_markdown(self, img_parent_path: str):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
return md_content
......@@ -10,10 +10,8 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str,
is_debug: bool = False):
self.pdf_type = self.PIP_OCR
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
super().__init__(pdf_bytes, model_list, image_writer, is_debug)
def pipe_classify(self):
self.pdf_type = UNIPipe.classify(self.pdf_bytes)
......@@ -26,12 +24,12 @@ class UNIPipe(AbsPipe):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug)
def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
def pipe_mk_uni_format(self, img_parent_path: str):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
return content_list
def pipe_mk_markdown(self):
markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
def pipe_mk_markdown(self, img_parent_path: str):
markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
return markdown_content
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment