Commit a0135640 authored by 赵小蒙

修复spans为空list导致的IndexError: list index out of range

parent f10b4a50
...@@ -24,34 +24,37 @@ def line_sort_spans_by_left_to_right(lines): ...@@ -24,34 +24,37 @@ def line_sort_spans_by_left_to_right(lines):
return line_objects return line_objects
def merge_spans_to_line(spans):
    """Group spans into lines according to their vertical position.

    The spans are sorted in place by ``span['bbox'][1]`` (the y0
    coordinate) and then walked in that order. A span joins the current
    line only when neither it nor any span already on the line is an
    interline equation, image or table, and its bbox overlaps the bbox of
    the line's last span according to
    ``__is_overlaps_y_exceeds_threshold``. Otherwise the current line is
    closed and the span starts a new one.

    Args:
        spans: list of span dicts. The code reads ``span['bbox']`` and
            ``span['type']``. The list is sorted in place.

    Returns:
        A list of lines, each line being a non-empty list of spans.
        An empty ``spans`` list yields ``[]``.
    """
    if not spans:
        # Without this guard, spans[0] below raises IndexError.
        return []

    def is_standalone(span):
        # Interline equations, images and tables always get a line of
        # their own.
        return span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]

    # Sort by the y0 coordinate so spans are visited top to bottom.
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # Short-circuit order matters: the overlap helper is consulted only
        # when no standalone span is involved.
        starts_new_line = (
            is_standalone(span)
            or any(is_standalone(s) for s in current_line)
            or not __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'])
        )
        if starts_new_line:
            lines.append(current_line)
            current_line = [span]
        else:
            current_line.append(span)

    # current_line always holds at least one span here, so flush it.
    lines.append(current_line)
    return lines
def merge_spans_to_line_by_layout(spans, layout_bboxes): def merge_spans_to_line_by_layout(spans, layout_bboxes):
lines = [] lines = []
......
...@@ -77,70 +77,73 @@ def adjust_bbox_for_standalone_block(spans): ...@@ -77,70 +77,73 @@ def adjust_bbox_for_standalone_block(spans):
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    """Group spans into lines and give each text line one shared y-range.

    The spans are sorted in place by ``span['bbox'][1]`` (y0) and grouped
    into lines with the same rules as ``merge_spans_to_line``. While
    grouping:

    * every interline-equation, image or table span is appended to
      ``displayed_list``;
    * every closed line that has more than one span, or whose first span
      is text or an inline equation, is appended to ``text_inline_lines``
      as ``(line_spans, (y0, y1))``.

    Afterwards every line in ``text_inline_lines`` is sorted by
    ``span['bbox'][0]`` (x0) and each of its spans has ``bbox[1]`` and
    ``bbox[3]`` overwritten with the line's recorded ``(y0, y1)``.

    Args:
        spans: list of span dicts (reads ``'bbox'`` and ``'type'``). It is
            sorted in place and its bboxes may be mutated.
        displayed_list: output list, extended in place.
        text_inline_lines: output list, extended in place.

    Returns:
        None. Results are delivered through the two output lists. An empty
        ``spans`` list is a no-op.
    """
    if not spans:
        # Nothing to process; also avoids IndexError on spans[0].
        return

    def is_standalone(span):
        # Interline equations, images and tables always get their own line.
        return span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]

    def is_text_line(line):
        # A line counts as inline text when it has several spans or starts
        # with a text / inline-equation span.
        return len(line) > 1 or line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]

    spans.sort(key=lambda span: span['bbox'][1])

    current_line = [spans[0]]
    if is_standalone(spans[0]):
        displayed_list.append(spans[0])
    # Vertical range that the whole current line will be snapped to.
    line_first_y0 = spans[0]["bbox"][1]
    line_first_y = spans[0]["bbox"][3]

    for span in spans[1:]:
        if is_standalone(span) or any(is_standalone(s) for s in current_line):
            if is_standalone(span):
                displayed_list.append(span)
            # Close the current line and start a new one with this span.
            if is_text_line(current_line):
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]
            continue

        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
            # A text span that joins the line redefines the line's y-range.
            # NOTE(review): compares against the literal "text" rather than
            # ContentType.Text -- confirm the two are equal.
            if span["type"] == "text":
                line_first_y0 = span["bbox"][1]
                line_first_y = span["bbox"][3]
            current_line.append(span)
        else:
            # No vertical overlap: the current line is recorded
            # unconditionally and a new one begins.
            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]

    # Flush the last line; current_line is never empty at this point.
    if is_text_line(current_line):
        text_inline_lines.append((current_line, (line_first_y0, line_first_y)))

    # Order the spans of every recorded line by the x0 coordinate.
    for line_spans, _ in text_inline_lines:
        line_spans.sort(key=lambda span: span['bbox'][0])

    # Give every span in a text line the same top and bottom coordinates.
    for line_spans, (y0, y1) in text_inline_lines:
        for span in line_spans:
            span["bbox"][1] = y0
            span["bbox"][3] = y1
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list): def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment