add garbled_rate too large process logic

7d04ed6e · 赵小蒙 · a3dc2cba · 7d04ed6e
Commit 7d04ed6e authored May 24, 2024 by 赵小蒙
Hide whitespace changes
Inline Side-by-side

Showing with 17 additions and 3 deletions

user_api.py magic_pdf/user_api.py +17 -3

No files found.
--- a/magic_pdf/user_api.py
+++ b/magic_pdf/user_api.py
@@ -78,9 +78,23 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
-
-    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
-        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
+    text_all = ""
+    for page_dict in pdf_info_dict['pdf_info']:
+        for para_block in page_dict['para_blocks']:
+            if para_block['type'] in ['title', 'text']:
+                for line in para_block['lines']:
+                    for span in line['spans']:
+                        text_all += span['content']
+
+    def calculate_garbled_rate(text):  # fraction of chars in text for which str.isprintable() is False; 0 for empty text, larger means more garbled
+        printable = sum(1 for c in text if c.isprintable())
+        total = len(text)
+        if total == 0:
+            return 0  # avoid a division-by-zero error
+        return (total - printable) / total
+
+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or calculate_garbled_rate(text_all) < 0.5:
+        logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")