Commit f9bd0040 authored by 赵小蒙

对模型的layout坐标转换

parent f62d1aa7
...@@ -55,7 +55,6 @@ def parse_pdf_by_ocr( ...@@ -55,7 +55,6 @@ def parse_pdf_by_ocr(
with open(pdf_local_path + ".pdf", "wb") as pdf_file: with open(pdf_local_path + ".pdf", "wb") as pdf_file:
pdf_file.write(pdf_bytes) pdf_file.write(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes) pdf_docs = fitz.open("pdf", pdf_bytes)
# 初始化空的pdf_info_dict # 初始化空的pdf_info_dict
pdf_info_dict = {} pdf_info_dict = {}
...@@ -83,7 +82,8 @@ def parse_pdf_by_ocr( ...@@ -83,7 +82,8 @@ def parse_pdf_by_ocr(
page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info) page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
header_bboxes = parse_headers(page_id, page, ocr_page_info) header_bboxes = parse_headers(page_id, page, ocr_page_info)
footer_bboxes = parse_footers(page_id, page, ocr_page_info) footer_bboxes = parse_footers(page_id, page, ocr_page_info)
footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode) footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path,
debug_mode=debug_mode)
# 构建需要remove的bbox列表 # 构建需要remove的bbox列表
need_remove_spans_bboxes = [] need_remove_spans_bboxes = []
...@@ -103,7 +103,8 @@ def parse_pdf_by_ocr( ...@@ -103,7 +103,8 @@ def parse_pdf_by_ocr(
allow_category_id_list = [1, 7, 13, 14, 15] allow_category_id_list = [1, 7, 13, 14, 15]
if category_id in allow_category_id_list: if category_id in allow_category_id_list:
x0, y0, _, _, x1, y1, _, _ = layout_det['poly'] x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
bbox = [int(x0/horizontal_scale_ratio), int(y0/vertical_scale_ratio), int(x1/horizontal_scale_ratio), int(y1/vertical_scale_ratio)] bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio),
int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)]
'''要删除的''' '''要删除的'''
# 3: 'header', # 页眉 # 3: 'header', # 页眉
# 4: 'page number', # 页码 # 4: 'page number', # 页码
...@@ -149,9 +150,11 @@ def parse_pdf_by_ocr( ...@@ -149,9 +150,11 @@ def parse_pdf_by_ocr(
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0 # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes = layout_detect(ocr_page_info['subfield_dets'], page)
# 将spans合并成line(从上到下,从左到右) # 将spans合并成line(在layout内,从上到下,从左到右)
lines = merge_spans_to_line(spans) lines = merge_spans_to_line(spans, layout_bboxes)
# logger.info(lines) # logger.info(lines)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
...@@ -162,12 +165,8 @@ def parse_pdf_by_ocr( ...@@ -162,12 +165,8 @@ def parse_pdf_by_ocr(
"lines": [line], "lines": [line],
}) })
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes = layout_detect(ocr_page_info['subfield_dets'])
# 构造pdf_info_dict # 构造pdf_info_dict
page_info = construct_page_component(page_id, blocks, layout_bboxes) page_info = construct_page_component(page_id, blocks, layout_bboxes)
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
return pdf_info_dict return pdf_info_dict
import fitz
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def get_center_point(bbox): def get_center_point(bbox):
""" """
...@@ -62,9 +66,7 @@ def adjust_layouts(layout_bboxes): ...@@ -62,9 +66,7 @@ def adjust_layouts(layout_bboxes):
return layout_bboxes return layout_bboxes
def layout_detect(layout_info, page: fitz.Page):
def layout_detect(layout_info):
""" """
对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。 对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。
...@@ -75,15 +77,18 @@ def layout_detect(layout_info): ...@@ -75,15 +77,18 @@ def layout_detect(layout_info):
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。 list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
""" """
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(layout_info, page)
# 初始化布局边界框列表 # 初始化布局边界框列表
layout_bboxes = [] layout_bboxes = []
# 遍历每个子布局 # 遍历每个子布局
for sub_layout in layout_info: for sub_layout in layout_info:
# 提取子布局的边界框坐标信息 # 提取子布局的边界框坐标信息
x0, y0, _, _, x1, y1, _, _ = sub_layout['poly'] x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio),
int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)]
# 创建子布局的边界框字典 # 创建子布局的边界框字典
layout_bbox = { layout_bbox = {
"layout_bbox": [x0, y0, x1, y1], "layout_bbox": bbox,
} }
# 将子布局的边界框添加到列表中 # 将子布局的边界框添加到列表中
layout_bboxes.append(layout_bbox) layout_bboxes.append(layout_bbox)
...@@ -119,5 +124,3 @@ def layout_detect(layout_info): ...@@ -119,5 +124,3 @@ def layout_detect(layout_info):
# 返回排序调整后的布局边界框列表 # 返回排序调整后的布局边界框列表
return layout_bboxes return layout_bboxes
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment