Commit c3b8f6d7 authored by kernel.h@qq.com's avatar kernel.h@qq.com

OCR line的左右侧如果超过layoutbox,那么让layoutbox截断左右侧

parent ec187a1d
...@@ -115,8 +115,8 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_ ...@@ -115,8 +115,8 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
if __name__ == '__main__': if __name__ == '__main__':
pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf" pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json" json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path) ocr_local_parse(pdf_path, json_file_path)
book_name = "科数网/edu_00011318" # book_name = "科数网/edu_00011318"
ocr_online_parse(book_name) # ocr_online_parse(book_name)
pass pass
...@@ -183,11 +183,31 @@ def __valign_lines(blocks, layout_bboxes): ...@@ -183,11 +183,31 @@ def __valign_lines(blocks, layout_bboxes):
return new_layout_bboxes return new_layout_bboxes
def __align_text_in_layout(blocks, layout_bboxes):
    """
    Clip OCR lines horizontally to the layout box that contains their block.

    OCR lines sometimes carry blank space before or after the text, so a
    line's bbox can extend past the left or right edge of its layout box.
    For every layout, each line of each block that `is_in_layout` places
    inside that layout has its left (index 0) and right (index 2) bbox
    coordinates cut off at the layout's edges.

    Line bboxes are modified in place; nothing is returned.
    """
    for layout_item in layout_bboxes:
        layout_box = layout_item['layout_bbox']
        # Select the member blocks up front, before any line is modified.
        contained_blocks = [blk for blk in blocks if is_in_layout(blk['bbox'], layout_box)]
        for blk in contained_blocks:
            for text_line in blk['lines']:
                line_bbox = text_line['bbox']
                left, right = line_bbox[0], line_bbox[2]
                # Only overwrite a coordinate when it actually sticks out.
                if left < layout_box[0]:
                    line_bbox[0] = layout_box[0]
                if right > layout_box[2]:
                    line_bbox[2] = layout_box[2]
def __common_pre_proc(blocks, layout_bboxes): def __common_pre_proc(blocks, layout_bboxes):
""" """
不分语言的,对文本进行预处理 不分语言的,对文本进行预处理
""" """
#__add_line_period(blocks, layout_bboxes) #__add_line_period(blocks, layout_bboxes)
__align_text_in_layout(blocks, layout_bboxes)
aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes) aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes)
return aligned_layout_bboxes return aligned_layout_bboxes
...@@ -233,7 +253,6 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_ ...@@ -233,7 +253,6 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_paras = [] layout_paras = []
right_tail_distance = 1.5 * char_avg_len right_tail_distance = 1.5 * char_avg_len
for lines in lines_group: for lines in lines_group:
paras = [] paras = []
total_lines = len(lines) total_lines = len(lines)
...@@ -575,8 +594,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang): ...@@ -575,8 +594,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
return connected_layout_paras, page_list_info return connected_layout_paras, page_list_info
def para_split(pdf_info_dict, debug_mode, lang="en"): def para_split(pdf_info_dict, debug_mode, lang="en"):
""" """
根据line和layout情况进行分段 根据line和layout情况进行分段
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment