Commit 40433aed authored by 赵小蒙

Fix bbox drawing logic for cross-page spans

parent c8ab7913
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
......@@ -148,6 +149,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
image_list = []
table_list = []
dropped_list = []
next_page_text_list = []
next_page_inline_equation_list = []
for page in pdf_info:
page_text_list = []
page_inline_equation_list = []
......@@ -155,6 +158,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
page_image_list = []
page_table_list = []
page_dropped_list = []
# Move cross-page spans collected on the previous page into this page's lists
if len(next_page_text_list) > 0:
page_text_list.extend(next_page_text_list)
next_page_text_list = []
if len(next_page_inline_equation_list) > 0:
page_inline_equation_list.extend(next_page_inline_equation_list)
next_page_inline_equation_list = []
# 构造dropped_list
for block in page["discarded_blocks"]:
if block["type"] == BlockType.Discarded:
......@@ -172,9 +184,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
for line in block["lines"]:
for span in line["spans"]:
if span["type"] == ContentType.Text:
page_text_list.append(span["bbox"])
if span.get(CROSS_PAGE, False):
next_page_text_list.append(span["bbox"])
else:
page_text_list.append(span["bbox"])
elif span["type"] == ContentType.InlineEquation:
page_inline_equation_list.append(span["bbox"])
if span.get(CROSS_PAGE, False):
next_page_inline_equation_list.append(span["bbox"])
else:
page_inline_equation_list.append(span["bbox"])
elif span["type"] == ContentType.InterlineEquation:
page_interline_equation_list.append(span["bbox"])
elif span["type"] == ContentType.Image:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment