Unverified Commit 10a95bcd authored by myhloli's avatar myhloli Committed by GitHub

Merge pull request #114 from papayalove/master

修复list拼接和reference分行问题
parents 3711a333 dbdbaf58
from sklearn.cluster import DBSCAN from sklearn.cluster import DBSCAN
import numpy as np import numpy as np
from loguru import logger from loguru import logger
import re
from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
from magic_pdf.libs.ocr_content_type import ContentType, BlockType from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.model.magic_model import MagicModel from magic_pdf.model.magic_model import MagicModel
...@@ -106,16 +106,18 @@ def __detect_list_lines(lines, new_layout_bboxes, lang): ...@@ -106,16 +106,18 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
3. 如果非顶格,首字符大写,编码为2 3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3 4. 如果非顶格,首字符非大写编码为3
""" """
x_map_tag_dict, min_x_tag = cluster_line_x(lines)
for l in lines: for l in lines:
first_char = __get_span_text(l['spans'][0])[0] span_text = __get_span_text(l['spans'][0])
first_char = span_text[0]
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes) layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
if not layout: if not layout:
line_fea_encode.append(0) line_fea_encode.append(0)
else: else:
layout_left = layout[0] #
if l['bbox'][0] == layout_left: if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum(): # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if not first_char.isalnum(): if not first_char.isalnum() or if_match_reference_list(span_text):
line_fea_encode.append(1) line_fea_encode.append(1)
else: else:
line_fea_encode.append(4) line_fea_encode.append(4)
...@@ -144,6 +146,36 @@ def __detect_list_lines(lines, new_layout_bboxes, lang): ...@@ -144,6 +146,36 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
return split_indices(total_lines, list_indice), list_start_idx return split_indices(total_lines, list_indice), list_start_idx
def cluster_line_x(lines: list) -> dict:
"""
对一个block内所有lines的bbox的x0聚类
"""
min_distance = 5
min_sample = 1
x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
x0_uniq_label = np.unique(x0_clusters.labels_)
#x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
x0_2_new_val = {} # 存储旧值对应的新值映射
min_x0 = round(lines[0]["bbox"][0])
for label in x0_uniq_label:
if label == -1:
continue
x0_index_of_label = np.where(x0_clusters.labels_ == label)
x0_raw_val = x0_lst[x0_index_of_label][:, 0]
x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
x0_2_new_val.update({round(raw_val): round(x0_new_val) for raw_val in x0_raw_val})
if x0_new_val < min_x0:
min_x0 = x0_new_val
return x0_2_new_val, min_x0
def if_match_reference_list(text: str) -> bool:
pattern = re.compile(r'^\d+\..*')
if pattern.match(text):
return True
else:
return False
def __valign_lines(blocks, layout_bboxes): def __valign_lines(blocks, layout_bboxes):
""" """
...@@ -315,10 +347,11 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"): ...@@ -315,10 +347,11 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
""" """
for list_start in list_start_line: for list_start in list_start_line:
if len(list_start) > 1: if len(list_start) > 1:
for i in range(1, len(list_start)): for i in range(0, len(list_start)):
index = list_start[i] - 1 index = list_start[i] - 1
if "content" in lines[index]["spans"][-1]: if index >= 0:
lines[index]["spans"][-1]["content"] += '\n\n' if "content" in lines[index]["spans"][-1]:
lines[index]["spans"][-1]["content"] += '\n\n'
layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
for content_type, start, end in text_segments: for content_type, start, end in text_segments:
if content_type == 'list': if content_type == 'list':
...@@ -388,20 +421,17 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, ...@@ -388,20 +421,17 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
logger.info(f"连接page {page_num} 内的list") logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行 # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = [] may_list_lines = []
for j in range(len(next_paras)): lines = next_first_para.get("lines", [])
lines = next_paras[j].get("lines", [])
if len(lines) == 1: # 只可能是一行,多行情况再需要分析了 for line in lines:
if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]: if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]:
may_list_lines.append(lines[0]) may_list_lines.append(line)
else:
break
else: else:
break break
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1: if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
pre_last_para.extend(may_list_lines) pre_last_para.extend(may_list_lines)
blocks_group[i] = blocks_group[i][len(may_list_lines):] next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):]
# layout_paras[i] = layout_paras[i][len(may_list_lines):]
return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息 return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息
...@@ -422,18 +452,14 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b ...@@ -422,18 +452,14 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
logger.info(f"连接page {page_num} 内的list") logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行 # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = [] may_list_lines = []
for j in range(len(next_page_paras[0])): next_page_first_para = next_page_paras[0][0]
next_page_block_j = next_page_paras[0][j] if next_page_first_para["type"] == BlockType.Text:
if next_page_block_j["type"] != BlockType.Text: lines = next_page_first_para["lines"]
break for line in lines:
lines = next_page_block_j["lines"] if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]:
if len(lines) == 1: # 只可能是一行,多行情况再需要分析了 may_list_lines.append(line)
if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], next_page_layout_bbox)[0]:
may_list_lines.append(lines[0])
else: else:
break break
else:
break
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1: if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
#pre_page_paras[-1].append(may_list_lines) #pre_page_paras[-1].append(may_list_lines)
...@@ -442,7 +468,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b ...@@ -442,7 +468,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
for span in line["spans"]: for span in line["spans"]:
span[CROSS_PAGE] = True span[CROSS_PAGE] = True
pre_page_paras[-1][-1]["lines"].extend(may_list_lines) pre_page_paras[-1][-1]["lines"].extend(may_list_lines)
next_page_paras[0] = next_page_paras[0][len(may_list_lines):] next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):]
return True return True
return False return False
...@@ -471,7 +497,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox): ...@@ -471,7 +497,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
if len(blocks_group) == 0: if len(blocks_group) == 0:
return connected_layout_blocks return connected_layout_blocks
#connected_layout_paras.append(layout_paras[0])
connected_layout_blocks.append(blocks_group[0]) connected_layout_blocks.append(blocks_group[0])
for i in range(1, len(blocks_group)): for i in range(1, len(blocks_group)):
try: try:
...@@ -484,6 +509,9 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox): ...@@ -484,6 +509,9 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text: if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text:
connected_layout_blocks.append(blocks_group[i]) connected_layout_blocks.append(blocks_group[i])
continue continue
if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0:
connected_layout_blocks.append(blocks_group[i])
continue
pre_last_line = blocks_group[i - 1][-1]["lines"][-1] pre_last_line = blocks_group[i - 1][-1]["lines"][-1]
next_first_line = blocks_group[i][0]["lines"][0] next_first_line = blocks_group[i][0]["lines"][0]
except Exception as e: except Exception as e:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment