Unverified commit 683fa633, authored by myhloli and committed by GitHub

Merge pull request #77 from myhloli/master

fix
parents 43d1d525 6e2f3097
...@@ -60,11 +60,11 @@ def prepare_env(pdf_file_name, method): ...@@ -60,11 +60,11 @@ def prepare_env(pdf_file_name, method):
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir): def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
if parse_method == "auto": if parse_method == "auto":
pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True) pipe = UNIPipe(pdf_bytes, model_list, image_writer, is_debug=True)
elif parse_method == "txt": elif parse_method == "txt":
pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True) pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
elif parse_method == "ocr": elif parse_method == "ocr":
pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True) pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
else: else:
print("unknow parse method") print("unknow parse method")
os.exit(1) os.exit(1)
...@@ -74,7 +74,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, ...@@ -74,7 +74,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
pdf_info = pipe.pdf_mid_data['pdf_info'] pdf_info = pipe.pdf_mid_data['pdf_info']
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir) draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir) draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
md_content = pipe.pipe_mk_markdown() md_content = pipe.pipe_mk_markdown(image_dir)
#part_file_name = datetime.now().strftime("%H-%M-%S") #part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer.write( md_writer.write(
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
...@@ -85,7 +85,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, ...@@ -85,7 +85,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
mode=AbsReaderWriter.MODE_TXT, mode=AbsReaderWriter.MODE_TXT,
) )
try: try:
content_list = pipe.pipe_mk_uni_format() content_list = pipe.pipe_mk_uni_format(image_dir)
except Exception as e: except Exception as e:
logger.exception(e) logger.exception(e)
md_writer.write( md_writer.write(
......
...@@ -305,7 +305,7 @@ def pdf_meta_scan(pdf_bytes: bytes): ...@@ -305,7 +305,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
page_width_pts, page_height_pts = get_pdf_page_size_pts(doc) page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
# logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}") # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")
svgs_per_page = get_svgs_per_page(doc) # svgs_per_page = get_svgs_per_page(doc)
# logger.info(f"svgs_per_page: {svgs_per_page}") # logger.info(f"svgs_per_page: {svgs_per_page}")
imgs_per_page = get_imgs_per_page(doc) imgs_per_page = get_imgs_per_page(doc)
# logger.info(f"imgs_per_page: {imgs_per_page}") # logger.info(f"imgs_per_page: {imgs_per_page}")
...@@ -331,7 +331,7 @@ def pdf_meta_scan(pdf_bytes: bytes): ...@@ -331,7 +331,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
"text_len_per_page": text_len_per_page, "text_len_per_page": text_len_per_page,
"text_layout_per_page": text_layout_per_page, "text_layout_per_page": text_layout_per_page,
"text_language": text_language, "text_language": text_language,
"svgs_per_page": svgs_per_page, # "svgs_per_page": svgs_per_page,
"imgs_per_page": imgs_per_page, # 增加每页img数量list "imgs_per_page": imgs_per_page, # 增加每页img数量list
"junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list "junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
"metadata": doc.metadata "metadata": doc.metadata
......
...@@ -16,11 +16,10 @@ class AbsPipe(ABC): ...@@ -16,11 +16,10 @@ class AbsPipe(ABC):
PIP_OCR = "ocr" PIP_OCR = "ocr"
PIP_TXT = "txt" PIP_TXT = "txt"
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, is_debug:bool=False): def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
self.pdf_bytes = pdf_bytes self.pdf_bytes = pdf_bytes
self.model_list = model_list self.model_list = model_list
self.image_writer = image_writer self.image_writer = image_writer
self.img_parent_path = img_parent_path
self.pdf_mid_data = None # 未压缩 self.pdf_mid_data = None # 未压缩
self.is_debug = is_debug self.is_debug = is_debug
......
...@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_ocr_pdf ...@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe): class OCRPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False): def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug) super().__init__(pdf_bytes, model_list, image_writer, is_debug)
def pipe_classify(self): def pipe_classify(self):
pass pass
...@@ -15,10 +15,10 @@ class OCRPipe(AbsPipe): ...@@ -15,10 +15,10 @@ class OCRPipe(AbsPipe):
def pipe_parse(self): def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self): def pipe_mk_uni_format(self, img_parent_path: str):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path) content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
return content_list return content_list
def pipe_mk_markdown(self): def pipe_mk_markdown(self, img_parent_path: str):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path) md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
return md_content return md_content
...@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_txt_pdf ...@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe): class TXTPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False): def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug) super().__init__(pdf_bytes, model_list, image_writer, is_debug)
def pipe_classify(self): def pipe_classify(self):
pass pass
...@@ -15,10 +15,10 @@ class TXTPipe(AbsPipe): ...@@ -15,10 +15,10 @@ class TXTPipe(AbsPipe):
def pipe_parse(self): def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self): def pipe_mk_uni_format(self, img_parent_path: str):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path) content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
return content_list return content_list
def pipe_mk_markdown(self): def pipe_mk_markdown(self, img_parent_path: str):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path) md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
return md_content return md_content
...@@ -10,10 +10,8 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf ...@@ -10,10 +10,8 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe(AbsPipe): class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
is_debug: bool = False): super().__init__(pdf_bytes, model_list, image_writer, is_debug)
self.pdf_type = self.PIP_OCR
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
def pipe_classify(self): def pipe_classify(self):
self.pdf_type = UNIPipe.classify(self.pdf_bytes) self.pdf_type = UNIPipe.classify(self.pdf_bytes)
...@@ -26,12 +24,12 @@ class UNIPipe(AbsPipe): ...@@ -26,12 +24,12 @@ class UNIPipe(AbsPipe):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug) is_debug=self.is_debug)
def pipe_mk_uni_format(self): def pipe_mk_uni_format(self, img_parent_path: str):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path) content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
return content_list return content_list
def pipe_mk_markdown(self): def pipe_mk_markdown(self, img_parent_path: str):
markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path) markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
return markdown_content return markdown_content
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment