Commit f702defe authored by kernel.h@qq.com

修正几个常量

parent 698c4a83
...@@ -78,9 +78,9 @@ class AbsPipe(ABC): ...@@ -78,9 +78,9 @@ class AbsPipe(ABC):
pdf_meta["text_layout_per_page"], pdf_meta["text_layout_per_page"],
) )
if is_text_pdf: if is_text_pdf:
return "txt" return AbsPipe.PIP_TXT
else: else:
return "ocr" return AbsPipe.PIP_OCR
@staticmethod @staticmethod
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str) -> list: def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
...@@ -90,9 +90,9 @@ class AbsPipe(ABC): ...@@ -90,9 +90,9 @@ class AbsPipe(ABC):
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"] parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"] pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == "txt": if parse_type == AbsPipe.PIP_TXT:
content_list = mk_universal_format(pdf_info_list, img_buket_path) content_list = mk_universal_format(pdf_info_list, img_buket_path)
elif parse_type == "ocr": elif parse_type == AbsPipe.PIP_OCR:
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path) content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
return content_list return content_list
...@@ -104,10 +104,10 @@ class AbsPipe(ABC): ...@@ -104,10 +104,10 @@ class AbsPipe(ABC):
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"] parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"] pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == "txt": if parse_type == AbsPipe.PIP_TXT:
content_list = mk_universal_format(pdf_info_list, img_buket_path) content_list = mk_universal_format(pdf_info_list, img_buket_path)
md_content = mk_mm_markdown(content_list) md_content = mk_mm_markdown(content_list)
elif parse_type == "ocr": elif parse_type == AbsPipe.PIP_OCR:
md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path) md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
return md_content return md_content
......
...@@ -19,6 +19,9 @@ from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr ...@@ -19,6 +19,9 @@ from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
**kwargs): **kwargs):
""" """
...@@ -32,7 +35,7 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit ...@@ -32,7 +35,7 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
debug_mode=is_debug, debug_mode=is_debug,
) )
pdf_info_dict["parse_type"] = "txt" pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
return pdf_info_dict return pdf_info_dict
...@@ -50,7 +53,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit ...@@ -50,7 +53,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
debug_mode=is_debug, debug_mode=is_debug,
) )
pdf_info_dict["_parse_type"] = "ocr" pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
return pdf_info_dict return pdf_info_dict
...@@ -82,8 +85,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr ...@@ -82,8 +85,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
if pdf_info_dict is None: if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.") raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
else: else:
pdf_info_dict["_parse_type"] = "ocr" pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
else: else:
pdf_info_dict["_parse_type"] = "txt" pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
return pdf_info_dict return pdf_info_dict
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment