Commit a8f2e7d6 authored by myhloli

fix(list): improve list identification accuracy

fix(list): improve list identification accuracy

- Adjust the threshold for determining right-side spacing to 0.26 * block_weight
- Add TODO comment for special list identification with all centered lines
- Modify the condition for recognizing short item lists with left alignment
- Update the condition for identifying the end of a list item
parent 87b9eeee
...@@ -103,7 +103,7 @@ def __is_list_or_index_block(block): ...@@ -103,7 +103,7 @@ def __is_list_or_index_block(block):
right_close_num += 1 right_close_num += 1
else: else:
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值 # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
closed_area = 0.3 * block_weight closed_area = 0.26 * block_weight
# closed_area = 5 * line_height # closed_area = 5 * line_height
if block['bbox_fs'][2] - line['bbox'][2] > closed_area: if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
right_not_close_num += 1 right_not_close_num += 1
...@@ -139,10 +139,12 @@ def __is_list_or_index_block(block): ...@@ -139,10 +139,12 @@ def __is_list_or_index_block(block):
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
return BlockType.Index return BlockType.Index
# @TODO 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
elif left_close_num >= 2 and ( elif left_close_num >= 2 and (
right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag: right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
# 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾 # 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
if left_close_num / len(block['lines']) > 0.9: if left_close_num / len(block['lines']) > 0.8:
# 这种是每个item只有一行,且左边都贴边的短item list # 这种是每个item只有一行,且左边都贴边的短item list
if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5: if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5:
for line in block['lines']: for line in block['lines']:
...@@ -162,7 +164,8 @@ def __is_list_or_index_block(block): ...@@ -162,7 +164,8 @@ def __is_list_or_index_block(block):
if line_start_flag: if line_start_flag:
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
line_start_flag = False line_start_flag = False
elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height: # elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
line[ListLineTag.IS_LIST_END_LINE] = True line[ListLineTag.IS_LIST_END_LINE] = True
line_start_flag = True line_start_flag = True
# 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致 # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment