Commit 49076f02 authored by 赵小蒙

fix draw_span_bbox logic

parent 3457256f
...@@ -28,7 +28,7 @@ import click ...@@ -28,7 +28,7 @@ import click
from loguru import logger from loguru import logger
from pathlib import Path from pathlib import Path
from magic_pdf.libs.draw_bbox import draw_layout_bbox from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe from magic_pdf.pipe.TXTPipe import TXTPipe
...@@ -73,6 +73,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, ...@@ -73,6 +73,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
pipe.pipe_parse() pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data['pdf_info'] pdf_info = pipe.pdf_mid_data['pdf_info']
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir) draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
md_content = pipe.pipe_mk_markdown() md_content = pipe.pipe_mk_markdown()
#part_file_name = datetime.now().strftime("%H-%M-%S") #part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer.write( md_writer.write(
......
from magic_pdf.libs.commons import fitz # PyMuPDF from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType, BlockType
def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config): def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
...@@ -58,32 +58,59 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path): ...@@ -58,32 +58,59 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
# Save the PDF # Save the PDF
pdf_docs.save(f"{out_path}/layout.pdf") pdf_docs.save(f"{out_path}/layout.pdf")
def _collect_span_bboxes(lines, bboxes_by_kind):
    """Sort the bbox of every span in ``lines`` into ``bboxes_by_kind``.

    Args:
        lines: iterable of line dicts, each carrying a ``'spans'`` list whose
            items have ``'type'`` and ``'bbox'`` entries.
        bboxes_by_kind: dict with the keys ``'text'``, ``'inline_equation'``,
            ``'interline_equation'``, ``'image'`` and ``'table'``; the
            matching list is extended in place.

    Spans whose type is none of the five handled content types are skipped.
    """
    for line in lines:
        for span in line['spans']:
            span_type = span['type']
            if span_type == ContentType.Text:
                bboxes_by_kind['text'].append(span['bbox'])
            elif span_type == ContentType.InlineEquation:
                bboxes_by_kind['inline_equation'].append(span['bbox'])
            elif span_type == ContentType.InterlineEquation:
                bboxes_by_kind['interline_equation'].append(span['bbox'])
            elif span_type == ContentType.Image:
                bboxes_by_kind['image'].append(span['bbox'])
            elif span_type == ContentType.Table:
                bboxes_by_kind['table'].append(span['bbox'])


def draw_span_bbox(pdf_info, pdf_bytes, out_path):
    """Draw the bbox of every span onto the PDF and save it as ``spans.pdf``.

    Span bboxes are gathered page by page from ``page['para_blocks']`` and
    grouped by content type; each group is drawn in its own colour:
    text (255, 0, 0), inline equation (0, 255, 0), interline equation
    (0, 0, 255), image (255, 204, 0) and table (204, 0, 255).

    Args:
        pdf_info: iterable of page dicts, each with a ``'para_blocks'`` list.
        pdf_bytes: raw bytes of the PDF that the boxes are drawn on.
        out_path: directory that receives ``spans.pdf``.

    Text, title and interline-equation blocks hold their lines directly;
    image and table blocks hold them one level deeper, under ``'blocks'``.
    Blocks of any other type are ignored.
    """
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []

    for page in pdf_info:
        page_bboxes = {
            'text': [],
            'inline_equation': [],
            'interline_equation': [],
            'image': [],
            'table': [],
        }
        for block in page['para_blocks']:
            block_type = block['type']
            if block_type in (BlockType.Text, BlockType.Title, BlockType.InterlineEquation):
                _collect_span_bboxes(block['lines'], page_bboxes)
            elif block_type in (BlockType.Image, BlockType.Table):
                # Image/table blocks wrap their lines in nested sub-blocks.
                for sub_block in block["blocks"]:
                    _collect_span_bboxes(sub_block['lines'], page_bboxes)

        # One entry per page, so the lists can be indexed by page number.
        text_list.append(page_bboxes['text'])
        inline_equation_list.append(page_bboxes['inline_equation'])
        interline_equation_list.append(page_bboxes['interline_equation'])
        image_list.append(page_bboxes['image'])
        table_list.append(page_bboxes['table'])

    # (per-page bbox lists, RGB colour) in the order they are drawn.
    layers = (
        (text_list, [255, 0, 0]),
        (inline_equation_list, [0, 255, 0]),
        (interline_equation_list, [0, 0, 255]),
        (image_list, [255, 204, 0]),
        (table_list, [204, 0, 255]),
    )

    pdf_docs = fitz.open("pdf", pdf_bytes)
    for i, page in enumerate(pdf_docs):
        # Draw the data collected for the current page.
        for bbox_list, rgb_config in layers:
            draw_bbox_without_number(i, bbox_list, page, rgb_config, False)

    # Save the PDF
    pdf_docs.save(f"{out_path}/spans.pdf")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment