Unverified commit 83b96f2f, authored by myhloli and committed by GitHub

Merge pull request #79 from myhloli/master

fix UNIPipe and spans space with language
parents 80ca01bd e25a8499
...@@ -159,10 +159,10 @@ def merge_para_with_text(para_block): ...@@ -159,10 +159,10 @@ def merge_para_with_text(para_block):
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n$$\n{span['content']}\n$$\n"
if content != '': if content != '':
if language == 'en': # 英文语境下 content间需要空格分隔 if 'zh' in language:
para_text += content + ' ' para_text += content # 中文语境下,content间不需要空格分隔
else: # 中文语境下,content间不需要空格分隔 else:
para_text += content para_text += content + ' ' # 英文语境下 content间需要空格分隔
return para_text return para_text
......
...@@ -10,11 +10,12 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf ...@@ -10,11 +10,12 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe(AbsPipe): class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False): def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
super().__init__(pdf_bytes, model_list, image_writer, is_debug) self.pdf_type = jso_useful_key["_pdf_type"]
super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
def pipe_classify(self): def pipe_classify(self):
self.pdf_type = UNIPipe.classify(self.pdf_bytes) self.pdf_type = AbsPipe.classify(self.pdf_bytes)
def pipe_parse(self): def pipe_parse(self):
if self.pdf_type == self.PIP_TXT: if self.pdf_type == self.PIP_TXT:
...@@ -46,14 +47,21 @@ if __name__ == '__main__': ...@@ -46,14 +47,21 @@ if __name__ == '__main__':
img_bucket_path = "imgs" img_bucket_path = "imgs"
img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path)) img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
pipe = UNIPipe(pdf_bytes, model_list, img_writer, img_bucket_path) # pdf_type = UNIPipe.classify(pdf_bytes)
# jso_useful_key = {
# "_pdf_type": pdf_type,
# "model_list": model_list
# }
jso_useful_key = {
"_pdf_type": "",
"model_list": model_list
}
pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
pipe.pipe_classify() pipe.pipe_classify()
pipe.pipe_parse() pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown() md_content = pipe.pipe_mk_markdown(img_bucket_path)
try: content_list = pipe.pipe_mk_uni_format(img_bucket_path)
content_list = pipe.pipe_mk_uni_format()
except Exception as e:
logger.exception(e)
md_writer = DiskReaderWriter(write_path) md_writer = DiskReaderWriter(write_path)
md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT) md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment