Commit 7d04ed6e authored by 赵小蒙

Add fallback to OCR parsing when the garbled rate of the text parse is too large

parent a3dc2cba
......@@ -78,9 +78,23 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
text_all = ""
for page_dict in pdf_info_dict['pdf_info']:
for para_block in page_dict['para_blocks']:
if para_block['type'] in ['title', 'text']:
for line in para_block['lines']:
for span in line['spans']:
text_all += span['content']
def calculate_garbled_rate(text):
    """Return the fraction of characters in *text* that look garbled.

    A character is counted as garbled when ``str.isprintable()`` is false
    for it and it is not whitespace. Whitespace is excluded explicitly
    because ``isprintable()`` rejects every separator except the ASCII
    space, so newlines, tabs and full-width spaces would otherwise
    inflate the rate of perfectly readable text.

    Args:
        text: The string to inspect.

    Returns:
        A float in the range [0.0, 1.0]; 0.0 when *text* is empty.
    """
    total = len(text)
    if total == 0:
        return 0.0  # avoid a division-by-zero error on empty input
    garbled = sum(
        1 for c in text if not (c.isprintable() or c.isspace())
    )
    return garbled / total
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or calculate_garbled_rate(text_all) < 0.5:
logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment