Commit d2cb75e8 authored by xuchao's avatar xuchao

利用下一行开头具有的空格特征分割段落

parent acabae56
......@@ -142,47 +142,51 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
return lines_group
def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_len=10):
def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_len=10):
"""
lines_group 进行行分段——layout内部进行分段。
lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
1. 先计算每个group的左右边界。
2. 然后根据行末尾特征进行分段。
末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。
且下一行开头不留空白。
"""
paras = []
right_tail_distance = 1.5 * char_avg_len
for lines in lines_group:
if len(lines)==0:
total_lines = len(lines)
if total_lines<=1: # 0行无需处理。1行无法分段。
continue
layout_right = max([line['bbox'][2] for line in lines])
#layout_right = max([line['bbox'][2] for line in lines])
layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2]
para = [] # 元素是line
for line in lines:
line_text = ''.join([__get_span_text(span) for span in line['spans']])
#logger.info(line_text)
last_span_type = line['spans'][-1]['type']
if last_span_type in [TEXT, INLINE_EQUATION]:
last_char = line['spans'][-1]['content'][-1]
if last_char in LINE_STOP_FLAG or line['bbox'][2] < layout_right - right_tail_distance:
for i, line in enumerate(lines):
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断一下行结尾特征。
cur_line_type = line['spans'][-1]['type']
#cur_line_last_char = line['spans'][-1]['content'][-1]
next_line = lines[i+1] if i<total_lines-1 else None
if cur_line_type in [TEXT, INLINE_EQUATION]:
if line['bbox'][2] < layout_right - right_tail_distance:
para.append(line)
paras.append(para)
# para_text = ''.join([span['content'] for line in para for span in line['spans']])
# logger.info(para_text)
para = []
elif line['bbox'][2] >= layout_right - right_tail_distance and next_line and next_line['bbox'][0] == layout_right: # 现在这行到了行尾沾满,下一行存在且顶格。
para.append(line)
else:
para.append(line)
paras.append(para)
para = []
else: # 其他,图片、表格、行间公式,各自占一段
if len(para)>0: # 先把之前的段落加入到结果中
paras.append(para)
para = []
paras.append([line]) # 再把当前行加入到结果中。当前行为行间公式、图、表等。
para = []
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text)
if len(para)>0:
paras.append(para)
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text)
para = []
return paras
......@@ -285,7 +289,7 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, lang="en"):
4. 图、表,目前独占一行,不考虑分段。
"""
lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段
layout_paras = __split_para_in_layoutbox(lines_group, layout_bboxes, lang) # layout内分段
layout_paras = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段
connected_layout_paras = __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang) # layout间链接段落
return connected_layout_paras
......@@ -315,4 +319,4 @@ def para_split(pdf_info_dict, lang="en"):
is_conn= __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, lang)
if is_conn:
logger.info(f"连接了第{i-1}页和第{i}页的段落")
\ No newline at end of file
logger.info(f"连接了第{i-1}页和第{i}页的段落")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment