Commit 564c4ce1 authored by myhloli

refactor(magic_pdf): improve line sorting and block indexing

- Insert lines into blocks based on median line height
- Calculate block index using the median of its line indices
- Remove virtual line information for table and image blocks
- Enhance line sorting algorithm for different block types
- Add line height calculation function
parent 4c9bf8ab
......@@ -341,6 +341,10 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
bbox = block['bbox']
index = block['index']
page_line_list.append({'index': index, 'bbox': bbox})
# for line in block['lines']:
# bbox = line['bbox']
# index = line['index']
# page_line_list.append({'index': index, 'bbox': bbox})
sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
pdf_docs = fitz.open('pdf', pdf_bytes)
......
......@@ -150,37 +150,99 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
def cal_block_index(fix_blocks, sorted_bboxes):
    """Assign a reading-order index to every block and to each of its lines.

    Each line's index is the position of its bbox in ``sorted_bboxes``; the
    block's index is the median of its line indices. A block with no lines
    falls back to the position of its own bbox.

    Args:
        fix_blocks: list of block dicts. Each block must have 'type', 'bbox'
            and 'lines' keys; each line must have a 'bbox' key.
        sorted_bboxes: list of bboxes in reading order.

    Returns:
        The same ``fix_blocks`` list, mutated in place: 'index' is set on
        every block and line, and 'lines' is removed from table/image blocks.

    Raises:
        ValueError: if a looked-up bbox is not present in ``sorted_bboxes``.
    """
    for block in fix_blocks:
        if len(block['lines']) == 0:
            block['index'] = sorted_bboxes.index(block['bbox'])
        else:
            line_index_list = []
            for line in block['lines']:
                line['index'] = sorted_bboxes.index(line['bbox'])
                line_index_list.append(line['index'])
            # The median keeps the block order stable even if a few of its
            # lines were sorted far away from the rest.
            block['index'] = statistics.median(line_index_list)

        # Drop the virtual line info of table/image blocks: those lines only
        # existed to take part in the sort.
        if block['type'] in ['table', 'image']:
            del block['lines']

    return fix_blocks
def sort_lines_by_model(fix_blocks, page_w, page_h):
def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
    """Split a block's bbox into vertically stacked "virtual line" bboxes.

    Args:
        block_bbox: (x0, y0, x1, y1) of the block. The original note describes
            (x0, y0) as the bottom-left corner and (x1, y1) as the top-right
            corner; the code only relies on y1 - y0 being the block height.
        line_height: typical body-text line height on the page.
        page_w: page width, used for the relative-width thresholds.
        page_h: page height, used for the relative-height threshold.

    Returns:
        A list of [x0, y0, x1, y1] boxes. When the block is no taller than
        three text lines, is narrow and elongated, or has a non-positive
        width, the list holds the original bbox only.
    """
    x0, y0, x1, y1 = block_bbox

    block_height = y1 - y0
    block_width = x1 - x0
    whole_block = [[x0, y0, x1, y1]]

    # A block no taller than three body-text lines is kept as a single line.
    if line_height * 3 >= block_height:
        return whole_block

    # Tall block between a quarter and half of the page width: possibly a
    # column of a two-column layout, so slice it finely at the text line height.
    looks_like_column = (
        block_height > page_h * 0.25
        and page_w * 0.5 > block_width > page_w * 0.25
    )

    if looks_like_column and line_height > 0:
        # line_height > 0 guards the division; a non-positive value falls
        # through to the width-based split below instead of crashing or
        # producing an empty list.
        lines = int(block_height / line_height)
    elif block_width > page_w * 0.4:
        # Wider than 0.4 of the page width: split into three lines.
        line_height = block_height / 3
        lines = 3
    elif block_width > page_w * 0.25:
        # Otherwise split into two lines.
        line_height = block_height / 2
        lines = 2
    elif block_width <= 0 or block_height / block_width > 1.2:
        # Degenerate width (would divide by zero) or a tall, thin block:
        # do not split.
        return whole_block
    else:
        # Not elongated: still split into two lines.
        line_height = block_height / 2
        lines = 2

    # Stack the lines starting from y0.
    current_y = y0
    lines_positions = []
    for _ in range(lines):
        lines_positions.append([x0, current_y, x1, current_y + line_height])
        current_y += line_height
    return lines_positions
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list = []
for block in fix_blocks:
if block['type'] in ['text', 'title', 'interline_equation']:
if len(block['lines']) == 0: # 没有line的block(一般是图片形式的文本块),就直接用block的bbox来排序
if len(block['lines']) == 0:
bbox = block['bbox']
page_line_list.append(bbox)
lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
for line in lines:
block['lines'].append({'bbox': line, 'spans': []})
page_line_list.extend(lines)
else:
for line in block['lines']:
bbox = line['bbox']
page_line_list.append(bbox)
elif block['type'] in ['table', 'image']: # 简单的把表和图都当成一个line处理
elif block['type'] in ['table', 'image']:
bbox = block['bbox']
page_line_list.append(bbox)
lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
block['lines'] = []
for line in lines:
block['lines'].append({'bbox': line, 'spans': []})
page_line_list.extend(lines)
# 使用layoutreader排序
x_scale = 1000.0 / page_w
......@@ -222,6 +284,19 @@ def sort_lines_by_model(fix_blocks, page_w, page_h):
return sorted_bboxes
def get_line_height(blocks):
    """Return the median line height of the text-like blocks on a page.

    Only blocks of type 'text', 'title' or 'interline_equation' contribute.
    Each line's height is ``bbox[3] - bbox[1]`` truncated to an int.

    Args:
        blocks: list of block dicts with 'type' and 'lines' keys; each line
            has a 'bbox' key.

    Returns:
        The median of the collected heights, or 10 when no line was found.
    """
    text_like_types = ['text', 'title', 'interline_equation']
    heights = [
        int(line['bbox'][3] - line['bbox'][1])
        for block in blocks
        if block['type'] in text_like_types
        for line in block['lines']
    ]
    if not heights:
        # No text line on the page: fall back to a fixed default height.
        return 10
    return statistics.median(heights)
def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
need_drop = False
drop_reason = []
......@@ -286,8 +361,11 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''对block进行fix操作'''
fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
'''获取所有line并计算正文line的高度'''
line_height = get_line_height(fix_blocks)
'''获取所有line并对line排序'''
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h)
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)
'''根据line的中位数算block的序列关系'''
fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment