Commit 34f89650 authored by myhloli

refactor(draw_bbox): add line sorting visualization

Add a new function `draw_line_sort_bbox` to visualize the sorting of lines on each page.
This includes indexing lines and handling both text and non-text elements such as tables
and images for better content organization.

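Also, comment out the GPU-specific `model.to("cuda")` call for flexibility, and disable the
overlap-separation step (`remove_overlap_between_bbox_for_block`) when preparing bounding boxes,
so `ocr_prepare_bboxes_for_layout_split_v2` no longer returns `drop_reasons`; this improves the
accuracy of layout splitting.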
parent 1efebe42
...@@ -334,7 +334,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]: ...@@ -334,7 +334,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]:
return parse_logits(logits, len(boxes)) return parse_logits(logits, len(boxes))
def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename): def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
layout_bbox_list = [] layout_bbox_list = []
from loguru import logger from loguru import logger
...@@ -344,35 +344,30 @@ def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -344,35 +344,30 @@ def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation': if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
for line in block['lines']: for line in block['lines']:
bbox = line['bbox'] bbox = line['bbox']
page_line_list.append(bbox) index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
if block['type'] == 'table' or block['type'] == 'image': if block['type'] == 'table' or block['type'] == 'image':
bbox = block['bbox'] bbox = block['bbox']
page_line_list.append(bbox) index = block['index']
page_line_list.append({'index': index, 'bbox': bbox})
# 使用layoutreader排序 sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
page_size = page['page_size'] layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
x_scale = 1000.0 / page_size[0] pdf_docs = fitz.open('pdf', pdf_bytes)
y_scale = 1000.0 / page_size[1] for i, page in enumerate(pdf_docs):
boxes = [] draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
for left, top, right, bottom in page_line_list: pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
left = round(left * x_scale)
top = round(top * y_scale)
right = round(right * x_scale) def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
bottom = round(bottom * y_scale) layout_bbox_list = []
assert (
1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0 for page in pdf_info:
), f"Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}" page_block_list = []
boxes.append([left, top, right, bottom]) for block in page['para_blocks']:
logger.info("layoutreader start") bbox = block['bbox']
start = time.time() page_block_list.append(bbox)
orders = do_predict(boxes) layout_bbox_list.append(page_block_list)
if torch.cuda.is_available():
torch.cuda.empty_cache()
print(orders)
logger.info(f"layoutreader end, cos time{time.time() - start}")
sorted_bboxes = [page_line_list[i] for i in orders]
layout_bbox_list.append(sorted_bboxes)
pdf_docs = fitz.open('pdf', pdf_bytes) pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs): for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False) draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
......
...@@ -99,7 +99,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]: ...@@ -99,7 +99,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]:
from transformers import LayoutLMv3ForTokenClassification from transformers import LayoutLMv3ForTokenClassification
from magic_pdf.v3.helpers import prepare_inputs, boxes2inputs, parse_logits from magic_pdf.v3.helpers import prepare_inputs, boxes2inputs, parse_logits
model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader") model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader")
model.to("cuda") # model.to("cuda")
inputs = boxes2inputs(boxes) inputs = boxes2inputs(boxes)
inputs = prepare_inputs(inputs, model) inputs = prepare_inputs(inputs, model)
logits = model(**inputs).logits.cpu().squeeze(0) logits = model(**inputs).logits.cpu().squeeze(0)
...@@ -145,17 +145,17 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, ...@@ -145,17 +145,17 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
# interline_equation_blocks参数不够准,后面切换到interline_equations上 # interline_equation_blocks参数不够准,后面切换到interline_equations上
interline_equation_blocks = [] interline_equation_blocks = []
if len(interline_equation_blocks) > 0: if len(interline_equation_blocks) > 0:
all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split_v2( all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equation_blocks, page_w, page_h) interline_equation_blocks, page_w, page_h)
else: else:
all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split_v2( all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equations, page_w, page_h) interline_equations, page_w, page_h)
if len(drop_reasons) > 0: # if len(drop_reasons) > 0:
need_drop = True # need_drop = True
drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION) # drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
'''先处理不需要排版的discarded_blocks''' '''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4) discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
...@@ -208,20 +208,31 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, ...@@ -208,20 +208,31 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
sorted_bboxes = [page_line_list[i] for i in orders] sorted_bboxes = [page_line_list[i] for i in orders]
'''根据line的中位数算block的序列关系''' '''根据line的中位数算block的序列关系'''
for line_index, bbox in enumerate(sorted_bboxes): block_without_lines = []
for block in fix_blocks: for block in fix_blocks:
if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation': if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
line_index_list = [] line_index_list = []
if len(block['lines']) == 0:
block_without_lines.append(block)
continue
else:
for line in block['lines']: for line in block['lines']:
if line['bbox'] == bbox: # for line_bbox in sorted_bboxes:
line['index'] = line_index # if line['bbox'] == line_bbox:
line_index_list.append(line_index) line['index'] = sorted_bboxes.index(line['bbox'])
line_index_list.append(line['index'])
median_value = statistics.median(line_index_list) median_value = statistics.median(line_index_list)
block['index'] = median_value block['index'] = median_value
elif block['type'] == 'table' or block['type'] == 'image': elif block['type'] == 'table' or block['type'] == 'image':
if block['bbox'] == bbox: # for line_bbox in sorted_bboxes:
block['index'] = line_index # if block['bbox'] == line_bbox:
block['index'] = sorted_bboxes.index(block['bbox'])
'''移除没有line的block'''
for block in block_without_lines:
fix_blocks.remove(block)
'''重排block''' '''重排block'''
sorted_blocks = sorted(fix_blocks, key=lambda b: b['index']) sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
...@@ -292,7 +303,9 @@ def pdf_parse_union(pdf_bytes, ...@@ -292,7 +303,9 @@ def pdf_parse_union(pdf_bytes,
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
"""分段""" """分段"""
para_split(pdf_info_dict, debug_mode=debug_mode) # para_split(pdf_info_dict, debug_mode=debug_mode)
for page_num, page in pdf_info_dict.items():
page['para_blocks'] = page['preproc_blocks']
"""dict转list""" """dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict) pdf_info_list = dict_to_list(pdf_info_dict)
......
...@@ -108,9 +108,9 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b ...@@ -108,9 +108,9 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
all_bboxes = remove_overlaps_min_blocks(all_bboxes) all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
'''将剩余的bbox做分离处理,防止后面分layout时出错''' '''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
return all_bboxes, all_discarded_blocks, drop_reasons return all_bboxes, all_discarded_blocks
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes): def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
......
...@@ -7,7 +7,7 @@ from loguru import logger ...@@ -7,7 +7,7 @@ from loguru import logger
import magic_pdf.model as model_config import magic_pdf.model as model_config
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox, from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
draw_model_bbox, draw_layout_sort_bbox) draw_model_bbox, draw_layout_sort_bbox, draw_line_sort_bbox)
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe from magic_pdf.pipe.TXTPipe import TXTPipe
...@@ -94,6 +94,8 @@ def do_parse( ...@@ -94,6 +94,8 @@ def do_parse(
draw_layout_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name) draw_layout_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
md_content = pipe.pipe_mk_markdown(image_dir, md_content = pipe.pipe_mk_markdown(image_dir,
drop_mode=DropMode.NONE, drop_mode=DropMode.NONE,
md_make_mode=f_make_md_mode) md_make_mode=f_make_md_mode)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment