Commit 778b1fb7 authored by liukaiwen

更新了para_split

parent bb2bf065
...@@ -87,17 +87,21 @@ def __detect_list_lines(lines, new_layout_bboxes, lang): ...@@ -87,17 +87,21 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
""" """
for l in lines: for l in lines:
first_char = __get_span_text(l['spans'][0])[0] first_char = __get_span_text(l['spans'][0])[0]
layout_left = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)[0] layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
if l['bbox'][0] == layout_left: if not layout:
if first_char.isupper() or first_char.isdigit(): line_fea_encode.append(0)
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
else: else:
if first_char.isupper(): layout_left = layout[0]
line_fea_encode.append(2) if l['bbox'][0] == layout_left:
if first_char.isupper() or first_char.isdigit():
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
else: else:
line_fea_encode.append(3) if first_char.isupper():
line_fea_encode.append(2)
else:
line_fea_encode.append(3)
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。 # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment