Commit f702defe authored by kernel.h@qq.com

修正几个常量

parent 698c4a83
......@@ -78,9 +78,9 @@ class AbsPipe(ABC):
pdf_meta["text_layout_per_page"],
)
if is_text_pdf:
return "txt"
return AbsPipe.PIP_TXT
else:
return "ocr"
return AbsPipe.PIP_OCR
@staticmethod
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
......@@ -90,9 +90,9 @@ class AbsPipe(ABC):
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == "txt":
if parse_type == AbsPipe.PIP_TXT:
content_list = mk_universal_format(pdf_info_list, img_buket_path)
elif parse_type == "ocr":
elif parse_type == AbsPipe.PIP_OCR:
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
return content_list
......@@ -104,10 +104,10 @@ class AbsPipe(ABC):
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"]
pdf_info_list = pdf_mid_data["pdf_info"]
if parse_type == "txt":
if parse_type == AbsPipe.PIP_TXT:
content_list = mk_universal_format(pdf_info_list, img_buket_path)
md_content = mk_mm_markdown(content_list)
elif parse_type == "ocr":
elif parse_type == AbsPipe.PIP_OCR:
md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
return md_content
......
......@@ -19,6 +19,9 @@ from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
**kwargs):
"""
......@@ -32,7 +35,7 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
debug_mode=is_debug,
)
pdf_info_dict["parse_type"] = "txt"
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
return pdf_info_dict
......@@ -50,7 +53,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = "ocr"
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
return pdf_info_dict
......@@ -82,8 +85,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
else:
pdf_info_dict["_parse_type"] = "ocr"
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
else:
pdf_info_dict["_parse_type"] = "txt"
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
return pdf_info_dict
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment