Unverified commit 863cd6c5, authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #845 from myhloli/dev

feat(para_split_v3): improve list identification with block aspect ratio
parents e909145b cf0d76c0
...@@ -63,6 +63,7 @@ def __is_list_or_index_block(block): ...@@ -63,6 +63,7 @@ def __is_list_or_index_block(block):
first_line = block['lines'][0] first_line = block['lines'][0]
line_height = first_line['bbox'][3] - first_line['bbox'][1] line_height = first_line['bbox'][3] - first_line['bbox'][1]
block_weight = block['bbox_fs'][2] - block['bbox_fs'][0] block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
block_height = block['bbox_fs'][3] - block['bbox_fs'][1]
left_close_num = 0 left_close_num = 0
left_not_close_num = 0 left_not_close_num = 0
...@@ -86,10 +87,12 @@ def __is_list_or_index_block(block): ...@@ -86,10 +87,12 @@ def __is_list_or_index_block(block):
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2 line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2 block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
if (line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and if (
block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height): line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
):
external_sides_not_close_num += 1 external_sides_not_close_num += 1
if abs(line_mid_x - block_mid_x) < line_height/2: if abs(line_mid_x - block_mid_x) < line_height / 2:
center_close_num += 1 center_close_num += 1
line_text = "" line_text = ""
...@@ -142,7 +145,7 @@ def __is_list_or_index_block(block): ...@@ -142,7 +145,7 @@ def __is_list_or_index_block(block):
line_num_flag = True line_num_flag = True
# 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
if ((left_close_num/len(block['lines']) >= 0.8 or right_close_num/len(block['lines']) >= 0.8) if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
and line_num_flag and line_num_flag
): ):
for line in block['lines']: for line in block['lines']:
...@@ -150,7 +153,13 @@ def __is_list_or_index_block(block): ...@@ -150,7 +153,13 @@ def __is_list_or_index_block(block):
return BlockType.Index return BlockType.Index
# 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近 # 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
elif external_sides_not_close_num >= 2 and center_close_num == len(block['lines']) and external_sides_not_close_num / len(block['lines']) >= 0.5: # 补充条件block的长宽比有要求
elif (
external_sides_not_close_num >= 2 and
center_close_num == len(block['lines']) and
external_sides_not_close_num / len(block['lines']) >= 0.5 and
block_height / block_weight > 0.4
):
for line in block['lines']: for line in block['lines']:
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
return BlockType.List return BlockType.List
...@@ -170,7 +179,7 @@ def __is_list_or_index_block(block): ...@@ -170,7 +179,7 @@ def __is_list_or_index_block(block):
if lines_text_list[i][-1] in LIST_END_FLAG: if lines_text_list[i][-1] in LIST_END_FLAG:
line[ListLineTag.IS_LIST_END_LINE] = True line[ListLineTag.IS_LIST_END_LINE] = True
if i + 1 < len(block['lines']): if i + 1 < len(block['lines']):
block['lines'][i+1][ListLineTag.IS_LIST_START_LINE] = True block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
# line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
else: else:
line_start_flag = False line_start_flag = False
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment