Unverified commit af84a8ac, authored by myhloli and committed by GitHub

Merge pull request #40 from papayalove/master

更新了para_split
parents 61572264 9d522ab3
...@@ -256,7 +256,7 @@ def __split_para_in_layoutbox2(lines_group, new_layout_bbox, lang="en", char_avg ...@@ -256,7 +256,7 @@ def __split_para_in_layoutbox2(lines_group, new_layout_bbox, lang="en", char_avg
def __split_para_in_layoutbox(blocks_group, new_layout_bbox, text_blocks, lang="en", char_avg_len=10): def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg_len=10):
""" """
lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。 lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
1. 先计算每个group的左右边界。 1. 先计算每个group的左右边界。
...@@ -624,7 +624,7 @@ def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang): ...@@ -624,7 +624,7 @@ def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
pass pass
def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_num, lang): def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
""" """
根据line和layout情况进行分段 根据line和layout情况进行分段
先实现一个根据行末尾特征分段的简单方法。 先实现一个根据行末尾特征分段的简单方法。
...@@ -637,7 +637,7 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_nu ...@@ -637,7 +637,7 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_nu
4. 图、表,目前独占一行,不考虑分段。 4. 图、表,目前独占一行,不考虑分段。
""" """
lines_group, blocks_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段 lines_group, blocks_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段
layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, text_blocks, lang) # layout内分段 layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang) # layout内分段
blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
page_num, lang) # layout之间连接列表段落 page_num, lang) # layout之间连接列表段落
connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang) # layout间链接段落 connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang) # layout间链接段落
...@@ -646,16 +646,15 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_nu ...@@ -646,16 +646,15 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_nu
def para_split_by_model(pdf_info_dict, debug_mode, magic_model: MagicModel, lang="en"): def para_split(pdf_info_dict, debug_mode, lang="en"):
new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS
all_page_list_info = [] # 保存每个页面开头和结尾是否是列表 all_page_list_info = [] # 保存每个页面开头和结尾是否是列表
for page_num, page in pdf_info_dict.items(): for page_num, page in pdf_info_dict.items():
blocks = page['preproc_blocks'] blocks = page['preproc_blocks']
layout_bboxes = page['layout_bboxes'] layout_bboxes = page['layout_bboxes']
text_blocks = magic_model.get_text_blocks(page_num)
new_layout_bbox = __common_pre_proc(blocks, layout_bboxes) new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
new_layout_of_pages.append(new_layout_bbox) new_layout_of_pages.append(new_layout_bbox)
splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_num, lang) splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang)
all_page_list_info.append(page_list_info) all_page_list_info.append(page_list_info)
page['para_blocks'] = splited_blocks page['para_blocks'] = splited_blocks
......
...@@ -11,7 +11,6 @@ from magic_pdf.libs.drop_tag import DropTag ...@@ -11,7 +11,6 @@ from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.hash_utils import compute_md5 from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.para.para_split import para_split from magic_pdf.para.para_split import para_split
from magic_pdf.para.para_split_by_model import para_split_by_model
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
...@@ -209,9 +208,7 @@ def parse_pdf_by_ocr( ...@@ -209,9 +208,7 @@ def parse_pdf_by_ocr(
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
"""分段""" """分段"""
if debug_mode:
para_split_by_model(pdf_info_dict, debug_mode=debug_mode)
else:
para_split(pdf_info_dict, debug_mode=debug_mode) para_split(pdf_info_dict, debug_mode=debug_mode)
"""dict转list""" """dict转list"""
......
...@@ -12,8 +12,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table ...@@ -12,8 +12,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2 from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
from magic_pdf.para.para_split import para_split # from magic_pdf.para.para_split import para_split
from magic_pdf.para.para_split_by_model import para_split_by_model from magic_pdf.para.para_split_v2 import para_split
def parse_pdf_by_ocr(pdf_bytes, def parse_pdf_by_ocr(pdf_bytes,
...@@ -96,9 +96,7 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -96,9 +96,7 @@ def parse_pdf_by_ocr(pdf_bytes,
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
"""分段""" """分段"""
if debug_mode: # if debug_mode:
para_split_by_model(pdf_info_dict, debug_mode=debug_mode)
else:
para_split(pdf_info_dict, debug_mode=debug_mode) para_split(pdf_info_dict, debug_mode=debug_mode)
"""dict转list""" """dict转list"""
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment