Unverified Commit 9d1aee32 authored by myhloli, committed by GitHub

Merge pull request #33 from myhloli/master

基于block内对spans做一些fix
parents 8973b939 54f165aa
...@@ -5,6 +5,8 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox ...@@ -5,6 +5,8 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
from magic_pdf.libs.drop_tag import DropTag from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType, BlockType from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block, fix_text_block from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block, fix_text_block
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
# 将每一个line中的span从左到右排序 # 将每一个line中的span从左到右排序
...@@ -157,6 +159,18 @@ def fill_spans_in_blocks(blocks, spans): ...@@ -157,6 +159,18 @@ def fill_spans_in_blocks(blocks, spans):
span_bbox = span['bbox'] span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.8: if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.8:
block_spans.append(span) block_spans.append(span)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list = []
text_inline_lines = []
modify_y_axis(block_spans, displayed_list, text_inline_lines)
'''模型识别错误的行间公式, type类型转换成行内公式'''
block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''bbox去除粘连'''
block_spans = remove_overlap_between_bbox(block_spans)
block_dict['spans'] = block_spans block_dict['spans'] = block_spans
block_with_spans.append(block_dict) block_with_spans.append(block_dict)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment