Commit d4f96a05 authored by 赵小蒙

add discarded spans drawing

parent 4adc761b
...@@ -65,14 +65,8 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config): ...@@ -65,14 +65,8 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
def draw_layout_bbox(pdf_info, pdf_bytes, out_path): def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
layout_bbox_list = [] layout_bbox_list = []
blocks_bbox_list = []
dropped_bbox_list = [] dropped_bbox_list = []
tables_list, tables_body_list, tables_caption_list, tables_footnote_list = ( tables_list, tables_body_list, tables_caption_list, tables_footnote_list = [], [], [], []
[],
[],
[],
[],
)
imgs_list, imgs_body_list, imgs_caption_list = [], [], [] imgs_list, imgs_body_list, imgs_caption_list = [], [], []
titles_list = [] titles_list = []
texts_list = [] texts_list = []
...@@ -80,7 +74,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path): ...@@ -80,7 +74,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
for page in pdf_info: for page in pdf_info:
page_layout_list = [] page_layout_list = []
page_dropped_list = [] page_dropped_list = []
page_blocks_bbox_list = []
tables, tables_body, tables_caption, tables_footnote = [], [], [], [] tables, tables_body, tables_caption, tables_footnote = [], [], [], []
imgs, imgs_body, imgs_caption = [], [], [] imgs, imgs_body, imgs_caption = [], [], []
titles = [] titles = []
...@@ -154,12 +147,22 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path): ...@@ -154,12 +147,22 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
interline_equation_list = [] interline_equation_list = []
image_list = [] image_list = []
table_list = [] table_list = []
dropped_list = []
for page in pdf_info: for page in pdf_info:
page_text_list = [] page_text_list = []
page_inline_equation_list = [] page_inline_equation_list = []
page_interline_equation_list = [] page_interline_equation_list = []
page_image_list = [] page_image_list = []
page_table_list = [] page_table_list = []
page_dropped_list = []
# 构造dropped_list
for block in page["discarded_blocks"]:
if block["type"] == BlockType.Discarded:
for line in block["lines"]:
for span in line["spans"]:
page_dropped_list.append(span["bbox"])
dropped_list.append(page_dropped_list)
# 构造其余useful_list
for block in page["para_blocks"]: for block in page["para_blocks"]:
if block["type"] in [ if block["type"] in [
BlockType.Text, BlockType.Text,
...@@ -205,6 +208,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path): ...@@ -205,6 +208,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False) draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
draw_bbox_without_number(i, image_list, page, [255, 204, 0], False) draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
draw_bbox_without_number(i, table_list, page, [204, 0, 255], False) draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
# Save the PDF # Save the PDF
pdf_docs.save(f"{out_path}/spans.pdf") pdf_docs.save(f"{out_path}/spans.pdf")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment