refactor(draw_bbox): add line sorting visualization

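Add a new function `draw_line_sort_bbox` to visualize the sort order of lines on each page. It reads the index assigned to each line of text, title, and interline-equation blocks, and the block-level index of non-text elements such as tables and images, then draws the bounding boxes in that order for better content organization. Also, comment out the hard-coded GPU call (`model.to("cuda")`) for flexibility, and disable the step that separates overlapping bounding boxes (`remove_overlap_between_bbox_for_block`) when preparing boxes for layout splitting, which is intended to improve the accuracy of layout splitting.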

refactor(draw_bbox): add line sorting visualization
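Add a new function `draw_line_sort_bbox` to visualize the sort order of lines on each page. It reads the index assigned to each line of text, title, and interline-equation blocks, and the block-level index of non-text elements such as tables and images, then draws the bounding boxes in that order for better content organization. Also, comment out the hard-coded GPU call (`model.to("cuda")`) for flexibility, and disable the step that separates overlapping bounding boxes (`remove_overlap_between_bbox_for_block`) when preparing boxes for layout splitting, which is intended to improve the accuracy of layout splitting.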
34f89650 · myhloli · 1efebe42 · 34f89650 · 34f89650 · 34f89650
Commit 34f89650 authored Sep 27, 2024 by myhloli
4 changed files
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -334,7 +334,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]:
    return parse_logits(logits, len(boxes))


-def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
+def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    layout_bbox_list = []

    from loguru import logger
@@ -344,35 +344,30 @@ def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
            if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
                for line in block['lines']:
                    bbox = line['bbox']
-                    page_line_list.append(bbox)
+                    index = line['index']
+                    page_line_list.append({'index': index, 'bbox': bbox})
            if block['type'] == 'table' or block['type'] == 'image':
                bbox = block['bbox']
-                page_line_list.append(bbox)
-
-        # 使用layoutreader排序
-        page_size = page['page_size']
-        x_scale = 1000.0 / page_size[0]
-        y_scale = 1000.0 / page_size[1]
-        boxes = []
-        logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
-        for left, top, right, bottom in page_line_list:
-            left = round(left * x_scale)
-            top = round(top * y_scale)
-            right = round(right * x_scale)
-            bottom = round(bottom * y_scale)
-            assert (
-                    1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
-            ), f"Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}"
-            boxes.append([left, top, right, bottom])
-        logger.info("layoutreader start")
-        start = time.time()
-        orders = do_predict(boxes)
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        print(orders)
-        logger.info(f"layoutreader end, cos time{time.time() - start}")
-        sorted_bboxes = [page_line_list[i] for i in orders]
-        layout_bbox_list.append(sorted_bboxes)
+                index = block['index']
+                page_line_list.append({'index': index, 'bbox': bbox})
+        sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
+        layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
+    pdf_docs = fitz.open('pdf', pdf_bytes)
+    for i, page in enumerate(pdf_docs):
+        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
+
+    pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
+
+
+def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
+    layout_bbox_list = []
+
+    for page in pdf_info:
+        page_block_list = []
+        for block in page['para_blocks']:
+            bbox = block['bbox']
+            page_block_list.append(bbox)
+        layout_bbox_list.append(page_block_list)
    pdf_docs = fitz.open('pdf', pdf_bytes)
    for i, page in enumerate(pdf_docs):
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)

--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -99,7 +99,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]:
    from transformers import LayoutLMv3ForTokenClassification
    from magic_pdf.v3.helpers import prepare_inputs, boxes2inputs, parse_logits
    model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader")
-    model.to("cuda")
+    # model.to("cuda")
    inputs = boxes2inputs(boxes)
    inputs = prepare_inputs(inputs, model)
    logits = model(**inputs).logits.cpu().squeeze(0)
@@ -145,17 +145,17 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
    # interline_equation_blocks参数不够准，后面切换到interline_equations上
    interline_equation_blocks = []
    if len(interline_equation_blocks) > 0:
-        all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split_v2(
+        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
            interline_equation_blocks, page_w, page_h)
    else:
-        all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split_v2(
+        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
            interline_equations, page_w, page_h)

-    if len(drop_reasons) > 0:
-        need_drop = True
-        drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
+    # if len(drop_reasons) > 0:
+    #     need_drop = True
+    #     drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)

    '''先处理不需要排版的discarded_blocks'''
    discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
@@ -208,20 +208,31 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
    sorted_bboxes = [page_line_list[i] for i in orders]

    '''根据line的中位数算block的序列关系'''
-    for line_index, bbox in enumerate(sorted_bboxes):
-        for block in fix_blocks:
-            if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
-                line_index_list = []
+    block_without_lines = []
+    for block in fix_blocks:
+        if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
+            line_index_list = []
+            if len(block['lines']) == 0:
+                block_without_lines.append(block)
+                continue
+            else:
                for line in block['lines']:
-                    if line['bbox'] == bbox:
-                        line['index'] = line_index
-                        line_index_list.append(line_index)
+                    # for line_bbox in sorted_bboxes:
+                    #     if line['bbox'] == line_bbox:
+                    line['index'] = sorted_bboxes.index(line['bbox'])
+                    line_index_list.append(line['index'])
                median_value = statistics.median(line_index_list)
                block['index'] = median_value

-            elif block['type'] == 'table' or block['type'] == 'image':
-                if block['bbox'] == bbox:
-                    block['index'] = line_index
+        elif block['type'] == 'table' or block['type'] == 'image':
+            # for line_bbox in sorted_bboxes:
+            #     if block['bbox'] == line_bbox:
+            block['index'] = sorted_bboxes.index(block['bbox'])
+
+    '''移除没有line的block'''
+    for block in block_without_lines:
+        fix_blocks.remove(block)
+
    '''重排block'''
    sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])

@@ -292,7 +303,9 @@ def pdf_parse_union(pdf_bytes,
        pdf_info_dict[f"page_{page_id}"] = page_info

    """分段"""
-    para_split(pdf_info_dict, debug_mode=debug_mode)
+    # para_split(pdf_info_dict, debug_mode=debug_mode)
+    for page_num, page in pdf_info_dict.items():
+        page['para_blocks'] = page['preproc_blocks']

    """dict转list"""
    pdf_info_list = dict_to_list(pdf_info_dict)

--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -108,9 +108,9 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    '''将剩余的bbox做分离处理，防止后面分layout时出错'''
-    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
+    # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

-    return all_bboxes, all_discarded_blocks, drop_reasons
+    return all_bboxes, all_discarded_blocks


 def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):

--- a/magic_pdf/tools/common.py
+++ b/magic_pdf/tools/common.py
@@ -7,7 +7,7 @@ from loguru import logger

 import magic_pdf.model as model_config
 from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
-                                      draw_model_bbox, draw_layout_sort_bbox)
+                                      draw_model_bbox, draw_layout_sort_bbox, draw_line_sort_bbox)
 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
@@ -94,6 +94,8 @@ def do_parse(

    draw_layout_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)

+    draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
+
    md_content = pipe.pipe_mk_markdown(image_dir,
                                       drop_mode=DropMode.NONE,
                                       md_make_mode=f_make_md_mode)