Commit 34f89650 authored by myhloli

refactor(draw_bbox): add line sorting visualization

Add a new function `draw_line_sort_bbox` to visualize the sorting of lines on each page.
This includes indexing lines and handling both text and non-text elements such as tables
and images for better content organization.

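Also, comment out the hard-coded `model.to("cuda")` call so the model is no longer forced onto a GPU,
and disable the overlap-separation step (`remove_overlap_between_bbox_for_block`) when preparing bounding
boxes for layout splitting, so `ocr_prepare_bboxes_for_layout_split_v2` no longer returns `drop_reasons`;
this is intended to improve the accuracy of layout splitting.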
parent 1efebe42
......@@ -334,7 +334,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]:
return parse_logits(logits, len(boxes))
def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
layout_bbox_list = []
from loguru import logger
......@@ -344,35 +344,30 @@ def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
for line in block['lines']:
bbox = line['bbox']
page_line_list.append(bbox)
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
if block['type'] == 'table' or block['type'] == 'image':
bbox = block['bbox']
page_line_list.append(bbox)
# 使用layoutreader排序
page_size = page['page_size']
x_scale = 1000.0 / page_size[0]
y_scale = 1000.0 / page_size[1]
boxes = []
logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
for left, top, right, bottom in page_line_list:
left = round(left * x_scale)
top = round(top * y_scale)
right = round(right * x_scale)
bottom = round(bottom * y_scale)
assert (
1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
), f"Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}"
boxes.append([left, top, right, bottom])
logger.info("layoutreader start")
start = time.time()
orders = do_predict(boxes)
if torch.cuda.is_available():
torch.cuda.empty_cache()
print(orders)
logger.info(f"layoutreader end, cos time{time.time() - start}")
sorted_bboxes = [page_line_list[i] for i in orders]
layout_bbox_list.append(sorted_bboxes)
index = block['index']
page_line_list.append({'index': index, 'bbox': bbox})
sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
layout_bbox_list = []
for page in pdf_info:
page_block_list = []
for block in page['para_blocks']:
bbox = block['bbox']
page_block_list.append(bbox)
layout_bbox_list.append(page_block_list)
pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
......
......@@ -99,7 +99,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]:
from transformers import LayoutLMv3ForTokenClassification
from magic_pdf.v3.helpers import prepare_inputs, boxes2inputs, parse_logits
model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader")
model.to("cuda")
# model.to("cuda")
inputs = boxes2inputs(boxes)
inputs = prepare_inputs(inputs, model)
logits = model(**inputs).logits.cpu().squeeze(0)
......@@ -145,17 +145,17 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
# interline_equation_blocks参数不够准,后面切换到interline_equations上
interline_equation_blocks = []
if len(interline_equation_blocks) > 0:
all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split_v2(
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equation_blocks, page_w, page_h)
else:
all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split_v2(
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equations, page_w, page_h)
if len(drop_reasons) > 0:
need_drop = True
drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
# if len(drop_reasons) > 0:
# need_drop = True
# drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
......@@ -208,20 +208,31 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
sorted_bboxes = [page_line_list[i] for i in orders]
'''根据line的中位数算block的序列关系'''
for line_index, bbox in enumerate(sorted_bboxes):
for block in fix_blocks:
if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
line_index_list = []
block_without_lines = []
for block in fix_blocks:
if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
line_index_list = []
if len(block['lines']) == 0:
block_without_lines.append(block)
continue
else:
for line in block['lines']:
if line['bbox'] == bbox:
line['index'] = line_index
line_index_list.append(line_index)
# for line_bbox in sorted_bboxes:
# if line['bbox'] == line_bbox:
line['index'] = sorted_bboxes.index(line['bbox'])
line_index_list.append(line['index'])
median_value = statistics.median(line_index_list)
block['index'] = median_value
elif block['type'] == 'table' or block['type'] == 'image':
if block['bbox'] == bbox:
block['index'] = line_index
elif block['type'] == 'table' or block['type'] == 'image':
# for line_bbox in sorted_bboxes:
# if block['bbox'] == line_bbox:
block['index'] = sorted_bboxes.index(block['bbox'])
'''移除没有line的block'''
for block in block_without_lines:
fix_blocks.remove(block)
'''重排block'''
sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
......@@ -292,7 +303,9 @@ def pdf_parse_union(pdf_bytes,
pdf_info_dict[f"page_{page_id}"] = page_info
"""分段"""
para_split(pdf_info_dict, debug_mode=debug_mode)
# para_split(pdf_info_dict, debug_mode=debug_mode)
for page_num, page in pdf_info_dict.items():
page['para_blocks'] = page['preproc_blocks']
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
......
......@@ -108,9 +108,9 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
return all_bboxes, all_discarded_blocks, drop_reasons
return all_bboxes, all_discarded_blocks
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
......
......@@ -7,7 +7,7 @@ from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
draw_model_bbox, draw_layout_sort_bbox)
draw_model_bbox, draw_layout_sort_bbox, draw_line_sort_bbox)
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
......@@ -94,6 +94,8 @@ def do_parse(
draw_layout_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
md_content = pipe.pipe_mk_markdown(image_dir,
drop_mode=DropMode.NONE,
md_make_mode=f_make_md_mode)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment