Unverified commit f7847cad, authored by myhloli and committed by GitHub

Merge pull request #44 from papayalove/master

更新了para_split
parents 1ceb456b 18164d5b
...@@ -171,6 +171,7 @@ def __valign_lines(blocks, layout_bboxes): ...@@ -171,6 +171,7 @@ def __valign_lines(blocks, layout_bboxes):
# 由于修改了block里的line长度,现在需要重新计算block的bbox # 由于修改了block里的line长度,现在需要重新计算block的bbox
for block in blocks_in_layoutbox: for block in blocks_in_layoutbox:
if len(block["lines"]) > 0:
block['bbox'] = [min([line['bbox'][0] for line in block['lines']]), block['bbox'] = [min([line['bbox'][0] for line in block['lines']]),
min([line['bbox'][1] for line in block['lines']]), min([line['bbox'][1] for line in block['lines']]),
max([line['bbox'][2] for line in block['lines']]), max([line['bbox'][2] for line in block['lines']]),
...@@ -192,7 +193,7 @@ def __align_text_in_layout(blocks, layout_bboxes): ...@@ -192,7 +193,7 @@ def __align_text_in_layout(blocks, layout_bboxes):
""" """
for layout in layout_bboxes: for layout in layout_bboxes:
lb = layout['layout_bbox'] lb = layout['layout_bbox']
blocks_in_layoutbox = [b for b in blocks if block["type"] == BlockType.Text and is_in_layout(b['bbox'], lb)] blocks_in_layoutbox = [block for block in blocks if block["type"] == BlockType.Text and is_in_layout(block['bbox'], lb)]
if len(blocks_in_layoutbox) == 0: if len(blocks_in_layoutbox) == 0:
continue continue
...@@ -270,6 +271,7 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg ...@@ -270,6 +271,7 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg
is_start_list = None is_start_list = None
is_end_list = None is_end_list = None
if len(blocks) == 0: if len(blocks) == 0:
list_info.append([False, False])
continue continue
if blocks[0]["type"] != BlockType.Text and blocks[-1]["type"] != BlockType.Text: if blocks[0]["type"] != BlockType.Text and blocks[-1]["type"] != BlockType.Text:
list_info.append([False, False]) list_info.append([False, False])
...@@ -360,6 +362,8 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, ...@@ -360,6 +362,8 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
return blocks_group, [False, False] return blocks_group, [False, False]
for i in range(1, len(blocks_group)): for i in range(1, len(blocks_group)):
if len(blocks_group[i]) == 0 or len(blocks_group[i-1]) == 0:
continue
pre_layout_list_info = layout_list_info[i - 1] pre_layout_list_info = layout_list_info[i - 1]
next_layout_list_info = layout_list_info[i] next_layout_list_info = layout_list_info[i]
pre_last_para = blocks_group[i - 1][-1].get("lines", []) pre_last_para = blocks_group[i - 1][-1].get("lines", [])
...@@ -431,7 +435,7 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes): ...@@ -431,7 +435,7 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
return None return None
def __connect_para_inter_layoutbox(layout_paras, blocks_group, new_layout_bbox, lang): def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang):
""" """
layout之间进行分段。 layout之间进行分段。
主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。 主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
...@@ -479,7 +483,7 @@ def __connect_para_inter_layoutbox(layout_paras, blocks_group, new_layout_bbox, ...@@ -479,7 +483,7 @@ def __connect_para_inter_layoutbox(layout_paras, blocks_group, new_layout_bbox,
"""连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
#connected_layout_paras[-1][-1].extend(layout_paras[i][0]) #connected_layout_paras[-1][-1].extend(layout_paras[i][0])
connected_layout_blocks[-1][-1]["lines"].extend(blocks_group[i][0]["lines"]) connected_layout_blocks[-1][-1]["lines"].extend(blocks_group[i][0]["lines"])
layout_paras[i].pop(0) # 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。 #layout_paras[i].pop(0) # 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。
blocks_group[i][0]["lines"] = [] #删除后一个layout第一个段落中的lines,因为他已经被合并到前一个layout的最后一个段落了 blocks_group[i][0]["lines"] = [] #删除后一个layout第一个段落中的lines,因为他已经被合并到前一个layout的最后一个段落了
blocks_group[i][0]["lines_deleted"] = True blocks_group[i][0]["lines_deleted"] = True
# if len(layout_paras[i]) == 0: # if len(layout_paras[i]) == 0:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment