Commit 330a7cd9 authored by liukaiwen

分段代码规范化

parent 97e67881
...@@ -258,29 +258,19 @@ def __pre_proc_en_blocks(blocks, layout_bboxes): ...@@ -258,29 +258,19 @@ def __pre_proc_en_blocks(blocks, layout_bboxes):
pass pass
def __group_line_by_layout(blocks, layout_bboxes, lang="en"): def __group_line_by_layout(blocks, layout_bboxes):
""" """
每个layout内的行进行聚合 每个layout内的行进行聚合
""" """
# 因为只是一个block一行目前, 一个block就是一个段落 # 因为只是一个block一行目前, 一个block就是一个段落
blocks_group = [] blocks_group = []
for lyout in layout_bboxes: for lyout in layout_bboxes:
#lines = [line for block in blocks if block["type"] == BlockType.Text and is_in_layout(block['bbox'], lyout['layout_bbox']) for line in
# block['lines']]
blocks_in_layout = [block for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox'])] blocks_in_layout = [block for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox'])]
blocks_group.append(blocks_in_layout) blocks_group.append(blocks_in_layout)
return blocks_group return blocks_group
def __split_para_in_layoutbox2(lines_group, new_layout_bbox, lang="en", char_avg_len=10): def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
"""
"""
def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg_len=10):
""" """
lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。 lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
1. 先计算每个group的左右边界。 1. 先计算每个group的左右边界。
...@@ -329,9 +319,6 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg ...@@ -329,9 +319,6 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg
index = list_start[i] - 1 index = list_start[i] - 1
if "content" in lines[index]["spans"][-1]: if "content" in lines[index]["spans"][-1]:
lines[index]["spans"][-1]["content"] += '\n\n' lines[index]["spans"][-1]["content"] += '\n\n'
# layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2]
# layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]
para = [] # 元素是line
layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
for content_type, start, end in text_segments: for content_type, start, end in text_segments:
if content_type == 'list': if content_type == 'list':
...@@ -340,7 +327,6 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg ...@@ -340,7 +327,6 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg
if end == total_lines - 1 and is_end_list is None: if end == total_lines - 1 and is_end_list is None:
layout_list_info[1] = True layout_list_info[1] = True
# paras = __split_para_lines(lines, text_blocks)
list_info.append(layout_list_info) list_info.append(layout_list_info)
return list_info return list_info
...@@ -472,7 +458,7 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes): ...@@ -472,7 +458,7 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
return None return None
def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang): def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
""" """
layout之间进行分段。 layout之间进行分段。
主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。 主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
...@@ -481,7 +467,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang): ...@@ -481,7 +467,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang):
2. 下一行开头不留空白。 2. 下一行开头不留空白。
""" """
connected_layout_paras = []
connected_layout_blocks = [] connected_layout_blocks = []
if len(blocks_group) == 0: if len(blocks_group) == 0:
return connected_layout_blocks return connected_layout_blocks
...@@ -689,11 +674,11 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang): ...@@ -689,11 +674,11 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
3. 参照上述行尾特征进行分段。 3. 参照上述行尾特征进行分段。
4. 图、表,目前独占一行,不考虑分段。 4. 图、表,目前独占一行,不考虑分段。
""" """
blocks_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段 blocks_group = __group_line_by_layout(blocks, layout_bboxes) # block内分段
layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang) # layout内分段 layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang) # layout内分段
blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
page_num, lang) # layout之间连接列表段落 page_num, lang) # layout之间连接列表段落
connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang) # layout间链接段落 connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox) # layout间链接段落
return connected_layout_blocks, page_list_info return connected_layout_blocks, page_list_info
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment