Commit b2790f6f authored by myhloli

refactor(drawing): simplify draw bbox functions and adjust debug config

Refactor the draw bbox functions by removing unused imports and simplifying the
code logic for drawing layout and line sorting bounding boxes. Adjust the debug
configuration to enable content list dumping and disable markdown making mode.
parent 16b51c79
import time
import torch
from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
......@@ -335,16 +331,15 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
layout_bbox_list = []
from loguru import logger
for page in pdf_info:
page_line_list = []
for block in page['preproc_blocks']:
if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
if block['type'] in ['text', 'title', 'interline_equation']:
for line in block['lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
if block['type'] == 'table' or block['type'] == 'image':
if block['type'] in ['table', 'image']:
bbox = block['bbox']
index = block['index']
page_line_list.append({'index': index, 'bbox': bbox})
......
......@@ -14,7 +14,6 @@ from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.local_math import float_equal
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v2 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
......@@ -153,10 +152,6 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equations, page_w, page_h)
# if len(drop_reasons) > 0:
# need_drop = True
# drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
......@@ -177,11 +172,11 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''获取所有line并对line排序'''
page_line_list = []
for block in fix_blocks:
if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
if block['type'] in ['text', 'title', 'interline_equation']:
for line in block['lines']:
bbox = line['bbox']
page_line_list.append(bbox)
elif block['type'] == 'table' or block['type'] == 'image': # 简单的把表和图都当成一个line处理
elif block['type'] in ['table', 'image']: # 简单的把表和图都当成一个line处理
bbox = block['bbox']
page_line_list.append(bbox)
......@@ -201,32 +196,25 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
boxes.append([left, top, right, bottom])
layoutreader_start = time.time()
orders = do_predict(boxes)
# if torch.cuda.is_available():
# torch.cuda.empty_cache()
# print(orders)
logger.info(f"layoutreader cost time{time.time() - layoutreader_start}")
sorted_bboxes = [page_line_list[i] for i in orders]
'''根据line的中位数算block的序列关系'''
block_without_lines = []
for block in fix_blocks:
if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
if block['type'] in ['text', 'title', 'interline_equation']:
line_index_list = []
if len(block['lines']) == 0:
block_without_lines.append(block)
continue
else:
for line in block['lines']:
# for line_bbox in sorted_bboxes:
# if line['bbox'] == line_bbox:
line['index'] = sorted_bboxes.index(line['bbox'])
line_index_list.append(line['index'])
median_value = statistics.median(line_index_list)
block['index'] = median_value
elif block['type'] == 'table' or block['type'] == 'image':
# for line_bbox in sorted_bboxes:
# if block['bbox'] == line_bbox:
elif block['type'] in ['table', 'image']:
block['index'] = sorted_bboxes.index(block['bbox'])
'''移除没有line的block'''
......
......@@ -7,7 +7,7 @@ from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
draw_model_bbox, draw_layout_sort_bbox, draw_line_sort_bbox)
draw_model_bbox, draw_line_sort_bbox)
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
......@@ -39,7 +39,7 @@ def do_parse(
f_dump_middle_json=True,
f_dump_model_json=True,
f_dump_orig_pdf=True,
f_dump_content_list=False,
f_dump_content_list=True,
f_make_md_mode=MakeMode.MM_MD,
f_draw_model_bbox=False,
f_draw_line_sort_bbox=False,
......@@ -49,7 +49,7 @@ def do_parse(
):
if debug_able:
logger.warning('debug mode is on')
f_dump_content_list = True
# f_dump_content_list = True
f_draw_model_bbox = True
f_draw_line_sort_bbox = True
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment