Commit caa1588a authored by 赵小蒙

ocr拼接逻辑更新

parent a0be4652
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio
def merge_spans(spans): # 删除重叠spans中较小的那些
def remove_overlaps_min_spans(spans):
    """Remove the smaller span of each heavily overlapping pair, in place.

    Every ordered pair of non-equal spans is passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8. When that helper
    returns a box, the first span in ``spans`` whose ``'bbox'`` equals the
    returned box is removed from the list.

    NOTE(review): the helper is defined elsewhere; presumably it returns the
    bbox of the smaller of the two boxes when their overlap exceeds the given
    ratio and ``None`` otherwise -- confirm against ``magic_pdf.libs.boxbase``.

    Args:
        spans: list of span mappings, each with a ``'bbox'`` entry. The list
            is mutated.

    Returns:
        The same ``spans`` list object, after removals.
    """
    # Iterate over snapshots so that removing from ``spans`` never disturbs
    # the loops. The inner snapshot is taken anew for each outer span and so
    # reflects removals made so far; the outer one does not.
    for first in list(spans):
        for second in list(spans):
            if first == second:
                # Equal spans (including a span paired with itself) are never compared.
                continue

            smaller_bbox = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.8)
            if smaller_bbox is None:
                continue

            # Drop the first span still present whose bbox matches the
            # returned box; do nothing if it was already removed.
            for candidate in spans:
                if candidate['bbox'] == smaller_bbox:
                    spans.remove(candidate)
                    break
    return spans
def merge_spans_to_line(spans):
# 按照y0坐标排序 # 按照y0坐标排序
spans.sort(key=lambda span: span['bbox'][1]) spans.sort(key=lambda span: span['bbox'][1])
...@@ -9,7 +22,8 @@ def merge_spans(spans): ...@@ -9,7 +22,8 @@ def merge_spans(spans):
current_line = [spans[0]] current_line = [spans[0]]
for span in spans[1:]: for span in spans[1:]:
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation" # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
if span['type'] == "displayed_equation" or any(s['type'] == "displayed_equation" for s in current_line): # image和table类型,同上
if span['type'] in ["displayed_equation", "image", "table"] or any(s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
# 则开始新行 # 则开始新行
lines.append(current_line) lines.append(current_line)
current_line = [span] current_line = [span]
......
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio from loguru import logger
from magic_pdf.libs.ocr_dict_merge import merge_spans
from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
def construct_page_component(page_id, text_blocks_preproc):
def construct_page_component(page_id, blocks):
return_dict = { return_dict = {
'preproc_blocks': text_blocks_preproc, 'preproc_blocks': blocks,
'page_idx': page_id 'page_idx': page_id,
} }
return return_dict return return_dict
...@@ -24,17 +25,32 @@ def parse_pdf_by_ocr( ...@@ -24,17 +25,32 @@ def parse_pdf_by_ocr(
spans = [] spans = []
for layout_det in layout_dets: for layout_det in layout_dets:
category_id = layout_det['category_id'] category_id = layout_det['category_id']
allow_category_id_list = [13, 14, 15] allow_category_id_list = [1, 7, 13, 14, 15]
if category_id in allow_category_id_list: if category_id in allow_category_id_list:
x0, y0, _, _, x1, y1, _, _ = layout_det['poly'] x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
bbox = [int(x0), int(y0), int(x1), int(y1)] bbox = [int(x0), int(y0), int(x1), int(y1)]
# 13: 'embedding', # 嵌入公式 '''要删除的'''
# 14: 'isolated', # 单行公式 # 3: 'header', # 页眉
# 15: 'ocr_text', # ocr识别文本 # 4: 'page number', # 页码
# 5: 'footnote', # 脚注
# 6: 'footer', # 页脚
'''当成span拼接的'''
# 1: 'image', # 图片
# 7: 'table', # 表格
# 13: 'inline_equation', # 行内公式
# 14: 'displayed_equation', # 行间公式
# 15: 'text', # ocr识别文本
'''layout信息'''
# 11: 'full column', # 单栏
# 12: 'sub column', # 多栏
span = { span = {
'bbox': bbox, 'bbox': bbox,
} }
if category_id == 13: if category_id == 1:
span['type'] = 'image'
elif category_id == 7:
span['type'] = 'table'
elif category_id == 13:
span['content'] = layout_det['latex'] span['content'] = layout_det['latex']
span['type'] = 'inline_equation' span['type'] = 'inline_equation'
elif category_id == 14: elif category_id == 14:
...@@ -48,18 +64,18 @@ def parse_pdf_by_ocr( ...@@ -48,18 +64,18 @@ def parse_pdf_by_ocr(
else: else:
continue continue
# 合并重叠的spans # 删除重叠spans中较小的那些
for span1 in spans.copy(): spans = remove_overlaps_min_spans(spans)
for span2 in spans.copy():
if span1 != span2: # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
if overlap_box is not None:
bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None) # 将spans合并成line(从上到下,从左到右)
if bbox_to_remove is not None: lines = merge_spans_to_line(spans)
spans.remove(bbox_to_remove) # logger.info(lines)
# 将spans合并成line # 从ocr_page_info中获取layout信息
lines = merge_spans(spans)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks = [] blocks = []
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment