Commit 5b9fa871 authored by 赵小蒙

Merge remote-tracking branch 'origin/master'

parents 71a042d9 bc339320
......@@ -59,6 +59,7 @@ from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval
# Read the `message` attribute from a no-argument instance of each exception type
# and keep it under a short name for later use.
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
titleDetectionException_msg = TitleDetectionException().message
......@@ -446,6 +447,10 @@ def parse_pdf_by_model(
==================================================================================================================================
进入段落处理-2阶段
"""
# 处理行内文字间距较大问题
pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict)
start_time = time.time()
para_process_pipeline = ParaProcessPipeline()
......
from magic_pdf.libs.boxbase import _is_in # 正则
from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio # 正则
from magic_pdf.libs.commons import fitz # pyMuPDF库
......@@ -18,7 +18,16 @@ def __solve_contain_bboxs(all_bbox_list: list):
dump_list.append(all_bbox_list[i])
elif _is_in(bbox2, bbox1):
dump_list.append(all_bbox_list[j])
else:
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
if ratio > 0.7:
s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
if s2 > s1:
dump_list.append(all_bbox_list[i])
else:
dump_list.append(all_bbox_list[i])
# 遍历需要删除的列表中的每个元素
for item in dump_list:
......
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:
    """Insert a separating space between line fragments that share one text row.

    Within each block of every page's ``preproc_blocks``, consecutive entries
    of ``block['lines']`` are compared by the integer-truncated top (y1) and
    bottom (y2) of their ``bbox``. When an entry has the same truncated
    top/bottom as the entry right before it, a single space is prepended to
    the text of its first span.

    Args:
        pdf_info_dict: Mapping with keys ``'page_0'`` .. ``'page_{n-1}'``
            (``n == len(pdf_info_dict)``). Each page value holds a
            ``'preproc_blocks'`` list; each block holds ``'lines'``; each line
            holds ``'bbox'`` (``[x1, y1, x2, y2]``) and ``'spans'`` (dicts with
            a ``'text'`` entry).

    Returns:
        The same ``pdf_info_dict`` object, modified in place.

    Raises:
        KeyError: If an expected page, block or line key is missing.
    """
    for page_idx in range(len(pdf_info_dict)):
        for block in pdf_info_dict[f'page_{page_idx}']['preproc_blocks']:
            # ``None`` means "no previous line in this block yet". A numeric
            # sentinel such as (0, 0) could collide with a real line whose
            # y-coordinates truncate to 0 and wrongly prefix the first line.
            prev_rows = None
            for line in block['lines']:
                _, top, _, bottom = line['bbox']
                rows = (int(top), int(bottom))
                spans = line['spans']
                # Only a line that has at least one span can receive the
                # space; an empty span list is left untouched instead of
                # raising IndexError.
                if rows == prev_rows and spans:
                    spans[0]['text'] = ' ' + spans[0]['text']
                prev_rows = rows
    return pdf_info_dict
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment