ocr_mk_mm_markdown_with_para_and_pagination逻辑更新

dbe79ba1 · 赵小蒙 · f36c2656 · dbe79ba1
Commit dbe79ba1 authored Mar 22, 2024 by 赵小蒙
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 19 deletions

ocr_mkcontent.py magic_pdf/dict2md/ocr_mkcontent.py +20 -19

No files found.
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -99,28 +99,29 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
    markdown_with_para_and_pagination = []
    for page_no, page_info in pdf_info_dict.items():
        page_markdown = []
-        paras = page_info.get("para_blocks")
-        if not paras:
+        paras_of_layout = page_info.get("para_blocks")
+        if not paras_of_layout:
            continue
-        for para in paras:
-            para_text = ''
-            for line in para:
-                for span in line['spans']:
-                    span_type = span.get('type')
-                    if span_type == ContentType.Text:
-                        content = split_long_words(span['content'])
-                        # content = span['content']
-                    elif span_type == ContentType.InlineEquation:
-                        content = f"${span['content']}$"
-                    elif span_type == ContentType.InterlineEquation:
-                        content = f"\n$$\n{span['content']}\n$$\n"
-                    elif span_type in [ContentType.Image, ContentType.Table]:
-                        content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
-                    para_text += content + ' '
-            page_markdown.append(para_text.strip() + '  ')
+        for paras in paras_of_layout:
+            for para in paras:
+                para_text = ''
+                for line in para:
+                    for span in line['spans']:
+                        span_type = span.get('type')
+                        if span_type == ContentType.Text:
+                            content = split_long_words(span['content'])
+                            # content = span['content']
+                        elif span_type == ContentType.InlineEquation:
+                            content = f"${span['content']}$"
+                        elif span_type == ContentType.InterlineEquation:
+                            content = f"\n$$\n{span['content']}\n$$\n"
+                        elif span_type in [ContentType.Image, ContentType.Table]:
+                            content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
+                        para_text += content + ' '
+                page_markdown.append(para_text.strip() + '  ')
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
-            'md': '\n\n'.join(page_markdown)
+            'md_content': '\n\n'.join(page_markdown)
        })
    return markdown_with_para_and_pagination