Commit 40433aed authored by 赵小蒙

fix cross page span drawing bbox logic

parent c8ab7913
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.commons import fitz # PyMuPDF from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.libs.ocr_content_type import ContentType, BlockType from magic_pdf.libs.ocr_content_type import ContentType, BlockType
...@@ -148,6 +149,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path): ...@@ -148,6 +149,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
image_list = [] image_list = []
table_list = [] table_list = []
dropped_list = [] dropped_list = []
next_page_text_list = []
next_page_inline_equation_list = []
for page in pdf_info: for page in pdf_info:
page_text_list = [] page_text_list = []
page_inline_equation_list = [] page_inline_equation_list = []
...@@ -155,6 +158,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path): ...@@ -155,6 +158,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
page_image_list = [] page_image_list = []
page_table_list = [] page_table_list = []
page_dropped_list = [] page_dropped_list = []
# Move the cross-page spans carried over from the previous page into the current page's lists
if len(next_page_text_list) > 0:
page_text_list.extend(next_page_text_list)
next_page_text_list = []
if len(next_page_inline_equation_list) > 0:
page_inline_equation_list.extend(next_page_inline_equation_list)
next_page_inline_equation_list = []
# 构造dropped_list # 构造dropped_list
for block in page["discarded_blocks"]: for block in page["discarded_blocks"]:
if block["type"] == BlockType.Discarded: if block["type"] == BlockType.Discarded:
...@@ -172,9 +184,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path): ...@@ -172,9 +184,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
for line in block["lines"]: for line in block["lines"]:
for span in line["spans"]: for span in line["spans"]:
if span["type"] == ContentType.Text: if span["type"] == ContentType.Text:
page_text_list.append(span["bbox"]) if span.get(CROSS_PAGE, False):
next_page_text_list.append(span["bbox"])
else:
page_text_list.append(span["bbox"])
elif span["type"] == ContentType.InlineEquation: elif span["type"] == ContentType.InlineEquation:
page_inline_equation_list.append(span["bbox"]) if span.get(CROSS_PAGE, False):
next_page_inline_equation_list.append(span["bbox"])
else:
page_inline_equation_list.append(span["bbox"])
elif span["type"] == ContentType.InterlineEquation: elif span["type"] == ContentType.InterlineEquation:
page_interline_equation_list.append(span["bbox"]) page_interline_equation_list.append(span["bbox"])
elif span["type"] == ContentType.Image: elif span["type"] == ContentType.Image:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment