Commit f36c2656 authored by kernel.h@qq.com's avatar kernel.h@qq.com

使用面积占比方式判断一行文本是不是在一个layoutbox里

parent a36ef4f8
...@@ -92,5 +92,5 @@ if __name__ == '__main__': ...@@ -92,5 +92,5 @@ if __name__ == '__main__':
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json" ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf" # ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json" # ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
#ocr_local_parse(ocr_pdf_path, ocr_json_file_path) ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
ocr_online_parse(book_name="美国加州中学教材/edu_00000060") #ocr_online_parse(book_name="美国加州中学教材/edu_00000060")
...@@ -71,9 +71,10 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict): ...@@ -71,9 +71,10 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict): def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for _, page_info in pdf_info_dict.items():
paras = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
if not paras: if not paras_of_layout:
continue continue
for paras in paras_of_layout:
for para in paras: for para in paras:
para_text = '' para_text = ''
for line in para: for line in para:
...@@ -81,12 +82,12 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict): ...@@ -81,12 +82,12 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
span_type = span.get('type') span_type = span.get('type')
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = split_long_words(span['content']) content = split_long_words(span['content'])
# content = span['content'] pass
elif span_type == ContentType.InlineEquation: elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$" content = f" ${span['content']}$ "
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]: elif span_type in [ ContentType.Image, ContentType.Table ]:
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n" content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
para_text += content + ' ' para_text += content + ' '
markdown.append(para_text.strip() + ' ') markdown.append(para_text.strip() + ' ')
......
...@@ -18,6 +18,33 @@ def _is_in_or_part_overlap(box1, box2) -> bool: ...@@ -18,6 +18,33 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
y1_1 < y0_2 or # box1在box2的上边 y1_1 < y0_2 or # box1在box2的上边
y0_1 > y1_2) # box1在box2的下边 y0_1 > y1_2) # box1在box2的下边
def _is_in_or_part_overlap_with_area_ratio(box1, box2, area_ratio_threshold=0.6):
"""
判断box1是否在box2里面,或者box1和box2有部分重叠,且重叠面积占box1的比例超过area_ratio_threshold
"""
if box1 is None or box2 is None:
return False
x0_1, y0_1, x1_1, y1_1 = box1
x0_2, y0_2, x1_2, y1_2 = box2
if not _is_in_or_part_overlap(box1, box2):
return False
# 计算重叠面积
x_left = max(x0_1, x0_2)
y_top = max(y0_1, y0_2)
x_right = min(x1_1, x1_2)
y_bottom = min(y1_1, y1_2)
overlap_area = (x_right - x_left) * (y_bottom - y_top)
# 计算box1的面积
box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
return overlap_area / box1_area > area_ratio_threshold
def _is_in(box1, box2) -> bool: def _is_in(box1, box2) -> bool:
""" """
box1是否完全在box2里面 box1是否完全在box2里面
......
...@@ -2,7 +2,7 @@ from sklearn.cluster import DBSCAN ...@@ -2,7 +2,7 @@ from sklearn.cluster import DBSCAN
import numpy as np import numpy as np
from loguru import logger from loguru import logger
from magic_pdf.libs.boxbase import _is_in_or_part_overlap from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType
...@@ -20,44 +20,6 @@ def __get_span_text(span): ...@@ -20,44 +20,6 @@ def __get_span_text(span):
return c return c
def __add_line_period(blocks, layout_bboxes):
"""
为每行添加句号
如果这个行
1. 以行内公式结尾,但没有任何标点符号,此时加个句号,认为他就是段落结尾。
"""
for block in blocks:
for line in block['lines']:
last_span = line['spans'][-1]
span_type = last_span['type']
if span_type in [INLINE_EQUATION]:
span_content = last_span['content'].strip()
if span_type==INLINE_EQUATION and span_content[-1] not in LINE_STOP_FLAG:
if span_type in [INLINE_EQUATION, INTERLINE_EQUATION]:
last_span['content'] = span_content + '.'
def __detect_line_align_direction(line, new_layout_bboxes):
"""
探测line是左对齐,还是右对齐,还是居中。
"""
lbox = line['bbox']
x0, x1 = lbox[0], lbox[2]
layout_x0, layout_x1 = new_layout_bboxes[0], new_layout_bboxes[2]
if x0 <= layout_x0 and x1 < layout_x1:
return "left"
elif x0 > layout_x0 and x1 >= layout_x1:
return "right"
else:
return "center"
def __detect_line_group_align_direction(lines, new_layout_bboxes):
"""
首先把lines按照行距离分成几部分。针对每一部分分别探测。
最后返回[(dir, lines), (dir, lines), ...]
"""
pass
def __detect_list_lines(lines, new_layout_bboxes, lang='en'): def __detect_list_lines(lines, new_layout_bboxes, lang='en'):
""" """
探测是否包含了列表,并且把列表的行分开. 探测是否包含了列表,并且把列表的行分开.
...@@ -166,7 +128,7 @@ def __valign_lines(blocks, layout_bboxes): ...@@ -166,7 +128,7 @@ def __valign_lines(blocks, layout_bboxes):
new_layout_bboxes = [] new_layout_bboxes = []
for layout_box in layout_bboxes: for layout_box in layout_bboxes:
blocks_in_layoutbox = [b for b in blocks if _is_in_or_part_overlap(b['bbox'], layout_box['layout_bbox'])] blocks_in_layoutbox = [b for b in blocks if is_in_layout(b['bbox'], layout_box['layout_bbox'])]
if len(blocks_in_layoutbox)==0: if len(blocks_in_layoutbox)==0:
continue continue
...@@ -252,7 +214,7 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"): ...@@ -252,7 +214,7 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
lines_group = [] lines_group = []
for lyout in layout_bboxes: for lyout in layout_bboxes:
lines = [line for block in blocks if _is_in_or_part_overlap(block['bbox'], lyout['layout_bbox']) for line in block['lines']] lines = [line for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox']) for line in block['lines']]
lines_group.append(lines) lines_group.append(lines)
return lines_group return lines_group
...@@ -267,12 +229,19 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_ ...@@ -267,12 +229,19 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
且下一行开头不留空白。 且下一行开头不留空白。
""" """
line_group_end_with_list = [] # 这个layout最后是不是列表,用于跨layout列表合并 list_info = [] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
paras = [] layout_paras = []
right_tail_distance = 1.5 * char_avg_len right_tail_distance = 1.5 * char_avg_len
for lines in lines_group: for lines in lines_group:
paras = []
total_lines = len(lines) total_lines = len(lines)
if total_lines<=1: # 0行无需处理。1行无法分段。 if total_lines==0:
continue # 0行无需处理
if total_lines==1: # 1行无法分段。
layout_paras.append([lines])
list_info.append([False, False])
continue continue
"""在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测, """在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测,
...@@ -292,7 +261,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_ ...@@ -292,7 +261,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2] layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2]
layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0] layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]
para = [] # 元素是line para = [] # 元素是line
is_lines_end_with_list = False layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
for content_type, start, end in text_segments: for content_type, start, end in text_segments:
if content_type == 'list': if content_type == 'list':
for i, line in enumerate(lines[start:end+1]): for i, line in enumerate(lines[start:end+1]):
...@@ -307,7 +276,10 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_ ...@@ -307,7 +276,10 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
if len(para)>0: if len(para)>0:
paras.append(para) paras.append(para)
para = [] para = []
is_lines_end_with_list = True if start==0:
layout_list_info[0] = True
if end==total_lines-1:
layout_list_info[1] = True
else: else:
for i, line in enumerate(lines[start:end+1]): for i, line in enumerate(lines[start:end+1]):
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断一下行结尾特征。 # 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断一下行结尾特征。
...@@ -335,12 +307,18 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_ ...@@ -335,12 +307,18 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
if len(para)>0: if len(para)>0:
paras.append(para) paras.append(para)
para = [] para = []
is_lines_end_with_list = False
line_group_end_with_list.append(is_lines_end_with_list) list_info.append(layout_list_info)
layout_paras.append(paras)
paras = []
return layout_paras, list_info
def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, lang="en"):
# TODO
return paras, line_group_end_with_list return layout_paras
def __find_layout_bbox_by_line(line_bbox, layout_bboxes): def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
...@@ -348,12 +326,12 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes): ...@@ -348,12 +326,12 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
根据line找到所在的layout 根据line找到所在的layout
""" """
for layout in layout_bboxes: for layout in layout_bboxes:
if _is_in_or_part_overlap(line_bbox, layout): if is_in_layout(line_bbox, layout):
return layout return layout
return None return None
def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, line_group_end_with_list, lang="en"): def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang="en"):
""" """
layout之间进行分段。 layout之间进行分段。
主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。 主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
...@@ -363,21 +341,18 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, line_group_end ...@@ -363,21 +341,18 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, line_group_end
""" """
connected_layout_paras = [] connected_layout_paras = []
for i, para in enumerate(layout_paras): connected_layout_paras.append(layout_paras[0])
if i==0: for i in range(1, len(layout_paras)):
connected_layout_paras.append(para) pre_last_line = layout_paras[i-1][-1][-1]
continue next_first_line = layout_paras[i][0][0]
pre_last_line = layout_paras[i-1][-1]
next_first_line = layout_paras[i][0]
pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
pre_last_line_type = pre_last_line['spans'][-1]['type'] pre_last_line_type = pre_last_line['spans'][-1]['type']
next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']]) next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
next_first_line_type = next_first_line['spans'][0]['type'] next_first_line_type = next_first_line['spans'][0]['type']
if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]: # TODO,真的要做好,要考虑跨table, image, 行间的情况 if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
connected_layout_paras.append(para) connected_layout_paras.append(layout_paras[i])
continue continue
pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)[2] pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)[2]
next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox)[0] next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox)[0]
...@@ -385,10 +360,15 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, line_group_end ...@@ -385,10 +360,15 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, line_group_end
next_first_line_text = next_first_line_text.strip() next_first_line_text = next_first_line_text.strip()
if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and next_first_line['bbox'][0]==next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and next_first_line['bbox'][0]==next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
"""连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
connected_layout_paras[-1].extend(para) connected_layout_paras[-1][-1].extend(layout_paras[i][0])
layout_paras[i].pop(0) # 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。
if len(layout_paras[i])==0:
layout_paras.pop(i)
else:
connected_layout_paras.append(layout_paras[i])
else: else:
"""连接段落条件不成立,将前一个layout的段落加入到结果中。""" """连接段落条件不成立,将前一个layout的段落加入到结果中。"""
connected_layout_paras.append(para) connected_layout_paras.append(layout_paras[i])
return connected_layout_paras return connected_layout_paras
...@@ -403,8 +383,8 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b ...@@ -403,8 +383,8 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
# 有的页面可能压根没有文字 # 有的页面可能压根没有文字
if len(pre_page_paras)==0 or len(next_page_paras)==0: if len(pre_page_paras)==0 or len(next_page_paras)==0:
return False return False
pre_last_para = pre_page_paras[-1] pre_last_para = pre_page_paras[-1][-1]
next_first_para = next_page_paras[0] next_first_para = next_page_paras[0][0]
pre_last_line = pre_last_para[-1] pre_last_line = pre_last_para[-1]
next_first_line = next_first_para[0] next_first_line = next_first_para[0]
pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
...@@ -423,8 +403,8 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b ...@@ -423,8 +403,8 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
next_first_line_text = next_first_line_text.strip() next_first_line_text = next_first_line_text.strip()
if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and next_first_line['bbox'][0]==next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and next_first_line['bbox'][0]==next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
"""连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
pre_page_paras[-1].extend(next_first_para) pre_last_para.extend(next_first_para)
next_page_paras.pop(0) # 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。 next_page_paras[0].pop(0) # 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。
return True return True
else: else:
return False return False
...@@ -443,8 +423,10 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, lang="en"): ...@@ -443,8 +423,10 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, lang="en"):
4. 图、表,目前独占一行,不考虑分段。 4. 图、表,目前独占一行,不考虑分段。
""" """
lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段 lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段
layout_paras, line_group_end_with_list = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段 layout_paras, layout_list_info = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段
connected_layout_paras = __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, line_group_end_with_list, lang) # layout间链接段落 layout_paras2 = __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, lang) # layout之间连接列表段落
connected_layout_paras = __connect_para_inter_layoutbox(layout_paras2, new_layout_bbox, lang) # layout间链接段落
return connected_layout_paras return connected_layout_paras
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment