Commit 8a179269 authored by 赵小蒙

update draw_span_bbox logic

parent 413a9df2
...@@ -151,6 +151,25 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path): ...@@ -151,6 +151,25 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
dropped_list = [] dropped_list = []
next_page_text_list = [] next_page_text_list = []
next_page_inline_equation_list = [] next_page_inline_equation_list = []
def get_span_info(span):
    """Append the span's bbox to the list that matches its content type.

    Text and inline-equation spans carrying a truthy ``CROSS_PAGE`` entry
    are collected in the ``next_page_*`` lists; otherwise they go to the
    ``page_*`` lists. Interline-equation, image and table spans always go
    to their ``page_*`` list. Spans of any other type are left untouched.

    The destination lists are not parameters: they are looked up by name
    from the surrounding scope each time this function is called.
    """
    kind = span["type"]

    # Pick the destination first; the bbox is only read once a
    # destination is known, so unrecognised span types never touch it.
    if kind == ContentType.Text:
        if span.get(CROSS_PAGE, False):
            target = next_page_text_list
        else:
            target = page_text_list
    elif kind == ContentType.InlineEquation:
        if span.get(CROSS_PAGE, False):
            target = next_page_inline_equation_list
        else:
            target = page_inline_equation_list
    elif kind == ContentType.InterlineEquation:
        target = page_interline_equation_list
    elif kind == ContentType.Image:
        target = page_image_list
    elif kind == ContentType.Table:
        target = page_table_list
    else:
        # Unknown content type: nothing to record.
        return

    target.append(span["bbox"])
for page in pdf_info: for page in pdf_info:
page_text_list = [] page_text_list = []
page_inline_equation_list = [] page_inline_equation_list = []
...@@ -162,10 +181,10 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path): ...@@ -162,10 +181,10 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
# 将跨页的span放到移动到下一页的列表中 # 将跨页的span放到移动到下一页的列表中
if len(next_page_text_list) > 0: if len(next_page_text_list) > 0:
page_text_list.extend(next_page_text_list) page_text_list.extend(next_page_text_list)
next_page_text_list = [] next_page_text_list.clear()
if len(next_page_inline_equation_list) > 0: if len(next_page_inline_equation_list) > 0:
page_inline_equation_list.extend(next_page_inline_equation_list) page_inline_equation_list.extend(next_page_inline_equation_list)
next_page_inline_equation_list = [] next_page_inline_equation_list.clear()
# 构造dropped_list # 构造dropped_list
for block in page["discarded_blocks"]: for block in page["discarded_blocks"]:
...@@ -183,36 +202,12 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path): ...@@ -183,36 +202,12 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
]: ]:
for line in block["lines"]: for line in block["lines"]:
for span in line["spans"]: for span in line["spans"]:
if span["type"] == ContentType.Text: get_span_info(span)
if span.get(CROSS_PAGE, False):
next_page_text_list.append(span["bbox"])
else:
page_text_list.append(span["bbox"])
elif span["type"] == ContentType.InlineEquation:
if span.get(CROSS_PAGE, False):
next_page_inline_equation_list.append(span["bbox"])
else:
page_inline_equation_list.append(span["bbox"])
elif span["type"] == ContentType.InterlineEquation:
page_interline_equation_list.append(span["bbox"])
elif span["type"] == ContentType.Image:
page_image_list.append(span["bbox"])
elif span["type"] == ContentType.Table:
page_table_list.append(span["bbox"])
elif block["type"] in [BlockType.Image, BlockType.Table]: elif block["type"] in [BlockType.Image, BlockType.Table]:
for sub_block in block["blocks"]: for sub_block in block["blocks"]:
for line in sub_block["lines"]: for line in sub_block["lines"]:
for span in line["spans"]: for span in line["spans"]:
if span["type"] == ContentType.Text: get_span_info(span)
page_text_list.append(span["bbox"])
elif span["type"] == ContentType.InlineEquation:
page_inline_equation_list.append(span["bbox"])
elif span["type"] == ContentType.InterlineEquation:
page_interline_equation_list.append(span["bbox"])
elif span["type"] == ContentType.Image:
page_image_list.append(span["bbox"])
elif span["type"] == ContentType.Table:
page_table_list.append(span["bbox"])
text_list.append(page_text_list) text_list.append(page_text_list)
inline_equation_list.append(page_inline_equation_list) inline_equation_list.append(page_inline_equation_list)
interline_equation_list.append(page_interline_equation_list) interline_equation_list.append(page_interline_equation_list)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment