Commit e31066ba authored by liukaiwen

更新了para_split

parent f519f63d
......@@ -256,7 +256,7 @@ def __split_para_in_layoutbox2(lines_group, new_layout_bbox, lang="en", char_avg
def __split_para_in_layoutbox(blocks_group, new_layout_bbox, text_blocks, lang="en", char_avg_len=10):
def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg_len=10):
"""
lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
1. 先计算每个group的左右边界。
......@@ -624,7 +624,7 @@ def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
pass
def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_num, lang):
def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
"""
根据line和layout情况进行分段
先实现一个根据行末尾特征分段的简单方法。
......@@ -637,7 +637,7 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_nu
4. 图、表,目前独占一行,不考虑分段。
"""
lines_group, blocks_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段
layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, text_blocks, lang) # layout内分段
layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang) # layout内分段
blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
page_num, lang) # layout之间连接列表段落
connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang) # layout间链接段落
......@@ -646,16 +646,15 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_nu
def para_split_by_model(pdf_info_dict, debug_mode, magic_model: MagicModel, lang="en"):
def para_split(pdf_info_dict, debug_mode, lang="en"):
new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS
all_page_list_info = [] # 保存每个页面开头和结尾是否是列表
for page_num, page in pdf_info_dict.items():
blocks = page['preproc_blocks']
layout_bboxes = page['layout_bboxes']
text_blocks = magic_model.get_text_blocks(page_num)
new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
new_layout_of_pages.append(new_layout_bbox)
splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_num, lang)
splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang)
all_page_list_info.append(page_list_info)
page['para_blocks'] = splited_blocks
......
......@@ -11,7 +11,6 @@ from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.para.para_split import para_split
from magic_pdf.para.para_split_by_model import para_split_by_model
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
......@@ -209,9 +208,7 @@ def parse_pdf_by_ocr(
pdf_info_dict[f"page_{page_id}"] = page_info
"""分段"""
if debug_mode:
para_split_by_model(pdf_info_dict, debug_mode=debug_mode)
else:
para_split(pdf_info_dict, debug_mode=debug_mode)
"""dict转list"""
......
......@@ -12,8 +12,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
from magic_pdf.para.para_split import para_split
from magic_pdf.para.para_split_by_model import para_split_by_model
# from magic_pdf.para.para_split import para_split
from magic_pdf.para.para_split_v2 import para_split
def parse_pdf_by_ocr(pdf_bytes,
......@@ -92,9 +92,7 @@ def parse_pdf_by_ocr(pdf_bytes,
pdf_info_dict[f"page_{page_id}"] = page_info
"""分段"""
if debug_mode:
para_split_by_model(pdf_info_dict, debug_mode=debug_mode)
else:
# if debug_mode:
para_split(pdf_info_dict, debug_mode=debug_mode)
"""dict转list"""
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment