Commit df14c61f authored by 赵小蒙

update: Enhance the capability to detect garbled document issues

parent 89d7964c
@@ -305,7 +305,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):

def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
-             text_layout_list: list):
+             text_layout_list: list, invalid_chars: bool):
    """
    Image and page dimensions here are in pts
    :param total_page:
@@ -322,7 +322,8 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
        'by_avg_words': classify_by_avg_words(text_len_list),
        'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
        'by_text_layout': classify_by_text_layout(text_layout_list),
-        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list)
+        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
+        'by_invalid_chars': invalid_chars,
    }
    if all(results.values()):
@@ -331,7 +332,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
        return False, results
    else:
        logger.warning(
-            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
+            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
+            f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
+            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
+            f" by_invalid_chars: {results['by_invalid_chars']}",
            file=sys.stderr)  # This case quickly surfaces unusual PDFs, so the classification algorithm can be tuned for them
        return False, results
...
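For reference, the PDF is routed to the text pipeline only when every heuristic in `results` votes True, so the new garbled-text check acts as a veto. A minimal sketch of that aggregation, with hypothetical vote values:

# Minimal sketch of the voting in classify(); the values below are hypothetical.
results = {
    'by_image_area': True,
    'by_text_len': True,
    'by_avg_words': True,
    'by_img_num': True,
    'by_text_layout': True,
    'by_img_narrow_strips': True,
    'by_invalid_chars': False,  # pdfminer found too many (cid:xxx) tokens
}
is_text_pdf = all(results.values())  # False: the garbled-text veto forces the OCR pipeline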
@@ -12,12 +12,13 @@ from collections import Counter

from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.language import detect_lang
+from magic_pdf.libs.pdf_check import detect_invalid_chars

scan_max_page = 50
junk_limit_min = 10


-def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
+def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
                               result]
    page_area = int(page_width_pts) * int(page_height_pts)
@@ -25,14 +26,15 @@ def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_p
    max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
    return max_image_area_per_page


def process_image(page, junk_img_bojids=[]):
-    page_result = []# stores the rect quadruples of all images on this page
+    page_result = []  # stores the rect quadruples of all images on this page
    items = page.get_images()
    dedup = set()
    for img in items:
        # This is the size at which the image is actually displayed on the page; returns an array whose elements start with …
-        img_bojid = img[0]# globally unique within the PDF; an image that recurs throughout the file is likely junk, e.g. a watermark or header/footer
-        if img_bojid in junk_img_bojids:# skip junk images
+        img_bojid = img[0]  # globally unique within the PDF; an image that recurs throughout the file is likely junk, e.g. a watermark or header/footer
+        if img_bojid in junk_img_bojids:  # skip junk images
            continue
        recs = page.get_image_rects(img, transform=True)
        if recs:
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
                dedup.add((x0, y0, x1, y1, img_bojid))
                page_result.append([x0, y0, x1, y1, img_bojid])
    return page_result


def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
    """
    Returns the rect quadruples of the images on each page; a page may contain several images.
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
    img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
    # Find the img_bojids that occur on more than half of the document's pages
-    junk_limit = max(len(doc)*0.5, junk_limit_min)# exempt documents with very few pages
+    junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exempt documents with very few pages

    junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
        result.append(page_result)
    for item in result:
        if not any(item):  # if any page has no images, this is a text-based PDF; check whether it is a special text-based PDF
-            if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:# special text-based PDF: clear the junk list and break
+            if max(imgs_len_list) == min(imgs_len_list) and max(
+                    imgs_len_list) >= junk_limit_min:  # special text-based PDF: clear the junk list and break
                junk_img_bojids = []
-            else:# an ordinary text-based PDF that contains junk images: keep the junk list
+            else:  # an ordinary text-based PDF that contains junk images: keep the junk list
                pass
            break_loop = True
            break
@@ -101,9 +106,9 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
            max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
            if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # the first 10 pages are not all large images, so this is probably a text-based PDF; clear the junk image list
                junk_img_bojids = []
-            else:# the first 10 pages all carry images, 80% of them large, with an identical and high per-page count: scanned PDF (type 1), keep the junk list
+            else:  # the first 10 pages all carry images, 80% of them large, with an identical and high per-page count: scanned PDF (type 1), keep the junk list
                pass
-        else:# per-page image counts differ: clear the junk list and rescan all images in the first 50 pages
+        else:  # per-page image counts differ: clear the junk list and rescan all images in the first 50 pages
            junk_img_bojids = []

    # Now start collecting image info for the first 50 pages
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
    median_width = page_width_list[len(page_width_list) // 2]
    median_height = page_height_list[len(page_height_list) // 2]

    return median_width, median_height
@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
    return text_len_lst


def get_pdf_text_layout_per_page(doc: fitz.Document):
    """
    For each page of the PDF, decide whether its text layout is horizontal, vertical, or unknown.
@@ -233,11 +238,16 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
        # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
    return text_layout_list


'''Custom exception raised for PDFs with too many SVGs on a single page'''


class PageSvgsTooManyError(Exception):
    def __init__(self, message="Page SVGs are too many"):
        self.message = message
        super().__init__(self.message)


def get_svgs_per_page(doc: fitz.Document):
    svgs_len_list = []
    for page_id, page in enumerate(doc):
@@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document):
        # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
    return svgs_len_list


def get_imgs_per_page(doc: fitz.Document):
    imgs_len_list = []
    for page_id, page in enumerate(doc):
@@ -287,6 +298,13 @@ def get_language(doc: fitz.Document):
    return language


+def check_invalid_chars(pdf_bytes):
+    """
+    Garbled-text detection
+    """
+    return detect_invalid_chars(pdf_bytes)
+
+
def pdf_meta_scan(pdf_bytes: bytes):
    """
    :param s3_pdf_path:
@@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
        # logger.info(f"text_layout_per_page: {text_layout_per_page}")
        text_language = get_language(doc)
        # logger.info(f"text_language: {text_language}")
+        invalid_chars = check_invalid_chars(pdf_bytes)
+        # logger.info(f"invalid_chars: {invalid_chars}")

        # Finally, emit one JSON record
        res = {
@@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
            # "svgs_per_page": svgs_per_page,
            "imgs_per_page": imgs_per_page,  # per-page image-count list
            "junk_img_bojids": junk_img_bojids,  # list of junk image bojids
+            "invalid_chars": invalid_chars,
            "metadata": doc.metadata
        }
        # logger.info(json.dumps(res, ensure_ascii=False))
...
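The junk-image filter above reduces to a single threshold: an image xref counted on at least max(0.5 * len(doc), junk_limit_min) pages is treated as a recurring watermark or header/footer. A standalone sketch of that arithmetic, with hypothetical counts:

from collections import Counter

# Hypothetical 60-page document: xref 101 appears on every page (a watermark), xref 102 on three.
junk_limit_min = 10
page_count = 60
img_bojid_counter = Counter({101: 60, 102: 3})

junk_limit = max(page_count * 0.5, junk_limit_min)  # 30.0; short documents fall back to junk_limit_min
junk_img_bojids = [b for b, c in img_bojid_counter.items() if c >= junk_limit]
print(junk_img_bojids)  # [101]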
+from io import BytesIO
+import re
+import fitz
+import numpy as np
+from loguru import logger
+from pdfminer.high_level import extract_text
+
+
+def calculate_sample_count(total_page: int, sample_ratio=0.1):
+    """
+    Compute the number of pages to sample from the total page count and the sampling ratio.
+    """
+    select_page_cnt = int(total_page * sample_ratio)
+    if select_page_cnt < 5:
+        select_page_cnt = min(10, total_page)
+    elif select_page_cnt > 10:
+        select_page_cnt = 10
+    return select_page_cnt
+
+
+def extract_pages(src_pdf_bytes: bytes):
+    pdf_docs = fitz.open("pdf", src_pdf_bytes)
+    total_page = len(pdf_docs)
+    if total_page == 0:
+        # If the PDF has no pages, return an empty document directly
+        logger.warning("PDF is empty, return empty document")
+        return fitz.Document()
+    select_page_cnt = calculate_sample_count(total_page)
+    page_num = np.random.choice(total_page, select_page_cnt, replace=False)
+    sample_docs = fitz.Document()
+    try:
+        for index in page_num:
+            sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
+    except Exception as e:
+        logger.exception(e)
+    return sample_docs
+
+
+def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
+    """
+    Detect whether the PDF contains invalid (garbled) characters
+    """
+    sample_docs = extract_pages(src_pdf_bytes)
+    sample_pdf_bytes = sample_docs.tobytes()
+    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
+    text = extract_text(sample_pdf_file_like_object)
+    # logger.info(text)
+    '''Text that pdfminer cannot map to characters comes out as (cid:xxx) tokens'''
+    cid_pattern = re.compile(r'\(cid:\d+\)')
+    matches = cid_pattern.findall(text)
+    cid_count = len(matches)
+    text_len = len(text)
+    logger.info(f"cid_count: {cid_count}, text_len: {text_len}")
+    if cid_count > 10:
+        return False  # garbled document
+    else:
+        return True  # normal document
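A usage sketch for the new module (the input path is hypothetical). Note the inverted convention: detect_invalid_chars returns True for a clean text layer and False for a garbled one, and it samples at most 10 pages internally, so it stays cheap even for large files:

from magic_pdf.libs.pdf_check import detect_invalid_chars

with open("example.pdf", "rb") as f:  # hypothetical input file
    pdf_bytes = f.read()

if detect_invalid_chars(pdf_bytes):
    print("text layer looks clean; the text pipeline is viable")
else:
    print("more than 10 (cid:xxx) tokens in the sample; treat as garbled")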
@@ -83,6 +83,7 @@ class AbsPipe(ABC):
                pdf_meta["text_len_per_page"],
                pdf_meta["imgs_per_page"],
                pdf_meta["text_layout_per_page"],
+                pdf_meta["invalid_chars"],
            )
            if is_text_pdf:
                return AbsPipe.PIP_TXT
...
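This call site is where the new metadata field feeds the classifier. A reduced sketch of the dispatch; the leading pdf_meta keys and the PIP_OCR constant are not shown in this hunk and are assumptions inferred from classify()'s signature:

# Reduced sketch of AbsPipe's classification dispatch; pdf_meta is the dict from pdf_meta_scan().
is_text_pdf, results = classify(
    pdf_meta["total_page"],            # leading keys inferred, not shown in the hunk
    pdf_meta["page_width_pts"],
    pdf_meta["page_height_pts"],
    pdf_meta["image_info_per_page"],
    pdf_meta["text_len_per_page"],
    pdf_meta["imgs_per_page"],
    pdf_meta["text_layout_per_page"],
    pdf_meta["invalid_chars"],         # the new garbled-text vote
)
pipe_type = AbsPipe.PIP_TXT if is_text_pdf else AbsPipe.PIP_OCR  # PIP_OCR assumed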
@@ -86,45 +86,46 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)

-    text_all = ""
-    for page_dict in pdf_info_dict['pdf_info']:
-        for para_block in page_dict['para_blocks']:
-            if para_block['type'] in ['title', 'text']:
-                for line in para_block['lines']:
-                    for span in line['spans']:
-                        text_all += span['content']
-
-    def calculate_not_common_character_rate(text):
-        garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
-        # count garbled characters
-        garbage_count = len(garbage_regex.findall(text))
-        total = len(text)
-        if total == 0:
-            return 0  # avoid division by zero
-        return garbage_count / total
-
-    def calculate_not_printable_rate(text):
-        printable_text = ""
-        for c in text:
-            if c.isprintable():
-                printable_text += c
-        printable_total = len(printable_text)
-        total = len(text)
-        if total == 0:
-            return 0  # avoid division by zero
-        return (total - printable_total) / total
-
-    not_common_character_rate = calculate_not_common_character_rate(text_all)
-    not_printable_rate = calculate_not_printable_rate(text_all)
-    pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
-    pdf_info_dict["_not_printable_rate"] = not_printable_rate
-    logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
+    # text_all = ""
+    # for page_dict in pdf_info_dict['pdf_info']:
+    #     for para_block in page_dict['para_blocks']:
+    #         if para_block['type'] in ['title', 'text']:
+    #             for line in para_block['lines']:
+    #                 for span in line['spans']:
+    #                     text_all += span['content']
+
+    # def calculate_not_common_character_rate(text):
+    #     garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
+    #     # count garbled characters
+    #     garbage_count = len(garbage_regex.findall(text))
+    #     total = len(text)
+    #     if total == 0:
+    #         return 0  # avoid division by zero
+    #     return garbage_count / total
+    #
+    # def calculate_not_printable_rate(text):
+    #     printable_text = ""
+    #     for c in text:
+    #         if c.isprintable():
+    #             printable_text += c
+    #     printable_total = len(printable_text)
+    #     total = len(text)
+    #     if total == 0:
+    #         return 0  # avoid division by zero
+    #     return (total - printable_total) / total
+    #
+    # not_common_character_rate = calculate_not_common_character_rate(text_all)
+    # not_printable_rate = calculate_not_printable_rate(text_all)
+    # pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
+    # pdf_info_dict["_not_printable_rate"] = not_printable_rate
+    # logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")

+    '''The new logic uses pdfminer to identify garbled PDFs ahead of the parsing pipeline; it is accurate and avoids false positives'''
    # not_common_character_rate can misfire on minority languages; not_printable_rate is friendlier to them
    if (pdf_info_dict is None
            or pdf_info_dict.get("_need_drop", False)
-            or not_printable_rate > 0.02  # in sampled normal PDFs this value never exceeded 0.01, so the threshold is set to 0.02
+            # or not_printable_rate > 0.02  # in sampled normal PDFs this value never exceeded 0.01, so the threshold is set to 0.02
            ):
-        logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
+        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
        if input_model_is_empty:
            pdf_models = doc_analyze(pdf_bytes, ocr=True)
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
...
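With the rate heuristics retired, garbled documents are vetoed before parsing ever starts, so this function now falls back to OCR only on a drop flag or a parse failure. The remaining control flow, reduced to its skeleton (names follow the diff; surrounding code is elided):

# Skeleton of the remaining txt -> ocr fallback in parse_union_pdf(), per this hunk.
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
    logger.warning("parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
    if input_model_is_empty:
        pdf_models = doc_analyze(pdf_bytes, ocr=True)  # re-run layout analysis with OCR enabled
    pdf_info_dict = parse_pdf(parse_pdf_by_ocr)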
@@ -15,3 +15,4 @@ scikit-learn>=1.0.2
nltk==3.8.1
s3pathlib>=2.1.1
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
+pdfminer.six>=20231228
\ No newline at end of file