Commit bc339320 authored by hsy's avatar hsy

增加了一个solve_line_alien.py,在detect_equation.py中修改了__solve_contain_bboxs函数,并在pdf_...

增加了一个solve_line_alien.py,在detect_equation.py中修改了__solve_contain_bboxs函数,并在pdf_parse_by_model.py里增加了对solve_line_alien.py中函数solve_inline_too_large_interval的调用
parent 779d2e8a
...@@ -59,6 +59,7 @@ from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter ...@@ -59,6 +59,7 @@ from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
titleDetectionException_msg = TitleDetectionException().message titleDetectionException_msg = TitleDetectionException().message
...@@ -446,6 +447,10 @@ def parse_pdf_by_model( ...@@ -446,6 +447,10 @@ def parse_pdf_by_model(
================================================================================================================================== ==================================================================================================================================
进入段落处理-2阶段 进入段落处理-2阶段
""" """
# 处理行内文字间距较大问题
pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict)
start_time = time.time() start_time = time.time()
para_process_pipeline = ParaProcessPipeline() para_process_pipeline = ParaProcessPipeline()
......
from magic_pdf.libs.boxbase import _is_in # 正则 from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio # 正则
from magic_pdf.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
...@@ -18,7 +18,16 @@ def __solve_contain_bboxs(all_bbox_list: list): ...@@ -18,7 +18,16 @@ def __solve_contain_bboxs(all_bbox_list: list):
dump_list.append(all_bbox_list[i]) dump_list.append(all_bbox_list[i])
elif _is_in(bbox2, bbox1): elif _is_in(bbox2, bbox1):
dump_list.append(all_bbox_list[j]) dump_list.append(all_bbox_list[j])
else:
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
if ratio > 0.7:
s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
if s2 > s1:
dump_list.append(all_bbox_list[i])
else:
dump_list.append(all_bbox_list[i])
# 遍历需要删除的列表中的每个元素 # 遍历需要删除的列表中的每个元素
for item in dump_list: for item in dump_list:
......
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:
    """Fix overly large gaps between pieces of text that sit on the same row.

    Within each block, a line whose bbox has the same integer-truncated top
    and bottom y-coordinates as the line immediately before it is treated as
    a continuation of that row, and a single space is prepended to the text
    of its first span.

    Args:
        pdf_info_dict: Mapping whose ``'page_<n>'`` entries are per-page dicts
            holding a ``'preproc_blocks'`` list (the text blocks of the page).
            Each block has ``'lines'``; each line has a ``'bbox'`` of
            ``[x1, y1, x2, y2]`` and a ``'spans'`` list of dicts with
            ``'text'``.

    Returns:
        The same ``pdf_info_dict`` object, modified in place.
    """
    # Walk the entries that are actually present instead of assuming the keys
    # are exactly page_0 .. page_{len-1}: a non-zero first page, a gap in the
    # numbering, or any extra non-page key would otherwise raise KeyError.
    for page_key, page_info in pdf_info_dict.items():
        if not (isinstance(page_key, str) and page_key.startswith('page_')):
            continue
        if not isinstance(page_info, dict):
            continue
        for block in page_info.get('preproc_blocks') or []:
            # None (rather than zeros) so the first line of a block can never
            # be mistaken for a continuation of a non-existent previous line.
            prev_rows = None
            for line in block.get('lines') or []:
                _, y_top, _, y_bottom = line['bbox']
                rows = (int(y_top), int(y_bottom))
                spans = line.get('spans')
                # Only touch lines that have a span to put the space into.
                if rows == prev_rows and spans:
                    spans[0]['text'] = ' ' + spans[0]['text']
                prev_rows = rows
    return pdf_info_dict
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment