Merge pull request #79 from myhloli/master

fix UNIPipe and spans space with language

Merge pull request #79 from myhloli/master
fix UNIPipe and spans space with language
83b96f2f · myhloli · GitHub · 80ca01bd · e25a8499 · 83b96f2f
Unverified commit 83b96f2f, authored Apr 29, 2024 by myhloli; committed by GitHub on Apr 29, 2024
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 21 additions and 13 deletions

ocr_mkcontent.py magic_pdf/dict2md/ocr_mkcontent.py +4 -4

UNIPipe.py magic_pdf/pipe/UNIPipe.py +17 -9

No files found.
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -159,10 +159,10 @@ def merge_para_with_text(para_block):
                content = f"\n$$\n{span['content']}\n$$\n"
            if content != '':
-                if language == 'en':  # 英文语境下 content间需要空格分隔
+                if 'zh' in language:
-                    para_text += content + ' '
+                    para_text += content  # 中文语境下，content间不需要空格分隔
-                else:  # 中文语境下，content间不需要空格分隔
+                else:
-                    para_text += content
+                    para_text += content + ' '  # 英文语境下 content间需要空格分隔
    return para_text

--- a/magic_pdf/pipe/UNIPipe.py
+++ b/magic_pdf/pipe/UNIPipe.py
@@ -10,11 +10,12 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 class UNIPipe(AbsPipe):
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
+    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
+        self.pdf_type = jso_useful_key["_pdf_type"]
+        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
    def pipe_classify(self):
-        self.pdf_type = UNIPipe.classify(self.pdf_bytes)
+        self.pdf_type = AbsPipe.classify(self.pdf_bytes)
    def pipe_parse(self):
        if self.pdf_type == self.PIP_TXT:
@@ -46,14 +47,21 @@ if __name__ == '__main__':
    img_bucket_path = "imgs"
    img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
-    pipe = UNIPipe(pdf_bytes, model_list, img_writer, img_bucket_path)
+    # pdf_type = UNIPipe.classify(pdf_bytes)
+    # jso_useful_key = {
+    #     "_pdf_type": pdf_type,
+    #     "model_list": model_list
+    # }
+    jso_useful_key = {
+        "_pdf_type": "",
+        "model_list": model_list
+    }
+    pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
    pipe.pipe_classify()
    pipe.pipe_parse()
-    md_content = pipe.pipe_mk_markdown()
+    md_content = pipe.pipe_mk_markdown(img_bucket_path)
-    try:
+    content_list = pipe.pipe_mk_uni_format(img_bucket_path)
-        content_list = pipe.pipe_mk_uni_format()
-    except Exception as e:
-        logger.exception(e)
    md_writer = DiskReaderWriter(write_path)
    md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT)