Unverified Commit 81eeef3a authored by myhloli, committed by GitHub

Merge branch 'magicpdf:master' into master

parents d7128a9d c23883b6
...@@ -132,7 +132,8 @@ def __valign_lines(blocks, layout_bboxes): ...@@ -132,7 +132,8 @@ def __valign_lines(blocks, layout_bboxes):
for layout_box in layout_bboxes: for layout_box in layout_bboxes:
blocks_in_layoutbox = [b for b in blocks if b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])] blocks_in_layoutbox = [b for b in blocks if b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
if len(blocks_in_layoutbox) == 0: if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]["lines"]) == 0:
new_layout_bboxes.append(layout_box['layout_bbox'])
continue continue
x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']]) x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']])
...@@ -400,6 +401,8 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b ...@@ -400,6 +401,8 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
""" """
if len(pre_page_paras) == 0 or len(next_page_paras) == 0: # 0的时候最后的return 会出错 if len(pre_page_paras) == 0 or len(next_page_paras) == 0: # 0的时候最后的return 会出错
return False return False
if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0:
return False
if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text: if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text:
return False return False
if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
...@@ -693,4 +696,10 @@ def para_split(pdf_info_dict, debug_mode, lang="en"): ...@@ -693,4 +696,10 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
page_paras = page['para_blocks'] page_paras = page['para_blocks']
new_layout_bbox = new_layout_of_pages[page_num] new_layout_bbox = new_layout_of_pages[page_num]
__connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode=debug_mode) __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode=debug_mode)
__merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang) __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang)
\ No newline at end of file
# layout展平
for page_num, page in enumerate(pdf_info_dict.values()):
page_paras = page['para_blocks']
page_blocks = [block for layout in page_paras for block in layout]
page["para_blocks"] = page_blocks
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment