Unverified Commit 59d817b1 authored by myhloli's avatar myhloli Committed by GitHub

Merge pull request #2 from icecraft/feat/proc_bbox

Feat/proc bbox
parents 070139a5 2611e853
...@@ -5,7 +5,14 @@ import time ...@@ -5,7 +5,14 @@ import time
from loguru import logger from loguru import logger
from demo.draw_bbox import draw_layout_bbox, draw_text_bbox from demo.draw_bbox import draw_layout_bbox, draw_text_bbox
from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output from magic_pdf.libs.commons import (
read_file,
join_path,
fitz,
get_img_s3_client,
get_delta_time,
get_docx_model_output,
)
from magic_pdf.libs.coordinate_transform import get_scale_ratio from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.safe_filename import sanitize_filename from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
...@@ -14,8 +21,12 @@ from magic_pdf.pre_proc.detect_header import parse_headers ...@@ -14,8 +21,12 @@ from magic_pdf.pre_proc.detect_header import parse_headers
from magic_pdf.pre_proc.detect_page_number import parse_pageNos from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout from magic_pdf.pre_proc.ocr_dict_merge import (
remove_overlaps_min_spans,
merge_spans_to_line_by_layout,
)
from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree): def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree):
...@@ -81,18 +92,23 @@ def parse_pdf_by_ocr( ...@@ -81,18 +92,23 @@ def parse_pdf_by_ocr(
if debug_mode: if debug_mode:
time_now = time.time() time_now = time.time()
logger.info(f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}") logger.info(
f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
)
start_time = time_now start_time = time_now
# 获取当前页的模型数据 # 获取当前页的模型数据
ocr_page_info = get_docx_model_output(pdf_model_output, pdf_model_profile, page_id) ocr_page_info = get_docx_model_output(
pdf_model_output, pdf_model_profile, page_id
)
"""从json中获取每页的页码、页眉、页脚的bbox""" """从json中获取每页的页码、页眉、页脚的bbox"""
page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info) page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
header_bboxes = parse_headers(page_id, page, ocr_page_info) header_bboxes = parse_headers(page_id, page, ocr_page_info)
footer_bboxes = parse_footers(page_id, page, ocr_page_info) footer_bboxes = parse_footers(page_id, page, ocr_page_info)
footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path, footnote_bboxes = parse_footnotes_by_model(
debug_mode=debug_mode) page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode
)
# 构建需要remove的bbox列表 # 构建需要remove的bbox列表
need_remove_spans_bboxes = [] need_remove_spans_bboxes = []
...@@ -101,51 +117,57 @@ def parse_pdf_by_ocr( ...@@ -101,51 +117,57 @@ def parse_pdf_by_ocr(
need_remove_spans_bboxes.extend(footer_bboxes) need_remove_spans_bboxes.extend(footer_bboxes)
need_remove_spans_bboxes.extend(footnote_bboxes) need_remove_spans_bboxes.extend(footnote_bboxes)
layout_dets = ocr_page_info['layout_dets'] layout_dets = ocr_page_info["layout_dets"]
spans = [] spans = []
# 计算模型坐标和pymu坐标的缩放比例 # 计算模型坐标和pymu坐标的缩放比例
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page) horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
ocr_page_info, page
)
for layout_det in layout_dets: for layout_det in layout_dets:
category_id = layout_det['category_id'] category_id = layout_det["category_id"]
allow_category_id_list = [1, 7, 13, 14, 15] allow_category_id_list = [1, 7, 13, 14, 15]
if category_id in allow_category_id_list: if category_id in allow_category_id_list:
x0, y0, _, _, x1, y1, _, _ = layout_det['poly'] x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio), bbox = [
int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)] int(x0 / horizontal_scale_ratio),
'''要删除的''' int(y0 / vertical_scale_ratio),
int(x1 / horizontal_scale_ratio),
int(y1 / vertical_scale_ratio),
]
"""要删除的"""
# 3: 'header', # 页眉 # 3: 'header', # 页眉
# 4: 'page number', # 页码 # 4: 'page number', # 页码
# 5: 'footnote', # 脚注 # 5: 'footnote', # 脚注
# 6: 'footer', # 页脚 # 6: 'footer', # 页脚
'''当成span拼接的''' """当成span拼接的"""
# 1: 'image', # 图片 # 1: 'image', # 图片
# 7: 'table', # 表格 # 7: 'table', # 表格
# 13: 'inline_equation', # 行内公式 # 13: 'inline_equation', # 行内公式
# 14: 'displayed_equation', # 行间公式 # 14: 'displayed_equation', # 行间公式
# 15: 'text', # ocr识别文本 # 15: 'text', # ocr识别文本
'''layout信息''' """layout信息"""
# 11: 'full column', # 单栏 # 11: 'full column', # 单栏
# 12: 'sub column', # 多栏 # 12: 'sub column', # 多栏
span = { span = {
'bbox': bbox, "bbox": bbox,
} }
if category_id == 1: if category_id == 1:
span['type'] = 'image' span["type"] = "image"
elif category_id == 7: elif category_id == 7:
span['type'] = 'table' span["type"] = "table"
elif category_id == 13: elif category_id == 13:
span['content'] = layout_det['latex'] span["content"] = layout_det["latex"]
span['type'] = 'inline_equation' span["type"] = "inline_equation"
elif category_id == 14: elif category_id == 14:
span['content'] = layout_det['latex'] span["content"] = layout_det["latex"]
span['type'] = 'displayed_equation' span["type"] = "displayed_equation"
elif category_id == 15: elif category_id == 15:
span['content'] = layout_det['text'] span["content"] = layout_det["text"]
span['type'] = 'text' span["type"] = "text"
# print(span) # print(span)
spans.append(span) spans.append(span)
else: else:
...@@ -160,12 +182,12 @@ def parse_pdf_by_ocr( ...@@ -160,12 +182,12 @@ def parse_pdf_by_ocr(
# 对image和table截图 # 对image和table截图
spans = cut_image_and_table(spans, page, page_id, book_name, save_path) spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧) # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
# 模型识别错误的行间公式, type类型转换成行内公式 # 模型识别错误的行间公式, type类型转换成行内公式
# bbox去除粘连 # bbox去除粘连
spans = remove_overlap_between_bbox(spans)
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0 # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
...@@ -175,14 +197,15 @@ def parse_pdf_by_ocr( ...@@ -175,14 +197,15 @@ def parse_pdf_by_ocr(
# 将spans合并成line(在layout内,从上到下,从左到右) # 将spans合并成line(在layout内,从上到下,从左到右)
lines = merge_spans_to_line_by_layout(spans, layout_bboxes) lines = merge_spans_to_line_by_layout(spans, layout_bboxes)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks = [] blocks = []
for line in lines: for line in lines:
blocks.append({ blocks.append(
"bbox": line['bbox'], {
"bbox": line["bbox"],
"lines": [line], "lines": [line],
}) }
)
# 构造pdf_info_dict # 构造pdf_info_dict
page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree) page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree)
...@@ -190,7 +213,9 @@ def parse_pdf_by_ocr( ...@@ -190,7 +213,9 @@ def parse_pdf_by_ocr(
# 在测试时,保存调试信息 # 在测试时,保存调试信息
if debug_mode: if debug_mode:
params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json") params_file_save_path = join_path(
save_tmp_path, "md", book_name, "preproc_out.json"
)
with open(params_file_save_path, "w", encoding="utf-8") as f: with open(params_file_save_path, "w", encoding="utf-8") as f:
json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4) json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
...@@ -198,5 +223,4 @@ def parse_pdf_by_ocr( ...@@ -198,5 +223,4 @@ def parse_pdf_by_ocr(
draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path) draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path) draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
return pdf_info_dict return pdf_info_dict
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in
def _remove_overlap_between_bbox(spans):
res = []
for v in spans:
for i in range(len(res)):
if _is_in(res[i]["bbox"], v["bbox"]):
continue
if _is_in_or_part_overlap(res[i]["bbox"], v["bbox"]):
ix0, iy0, ix1, iy1 = res[i]["bbox"]
x0, y0, x1, y1 = v["bbox"]
diff_x = min(x1, ix1) - max(x0, ix0)
diff_y = min(y1, iy1) - max(y0, iy0)
if diff_y > diff_x:
if x1 >= ix1:
mid = (x0 + ix1) // 2
ix1 = min(mid, ix1)
x0 = max(mid + 1, x0)
else:
mid = (ix0 + x1) // 2
ix0 = max(mid + 1, ix0)
x1 = min(mid, x1)
else:
if y1 >= iy1:
mid = (y0 + iy1) // 2
y0 = max(mid + 1, y0)
iy1 = min(iy1, mid)
else:
mid = (iy0 + y1) // 2
y1 = min(y1, mid)
iy0 = max(mid + 1, iy0)
res[i]["bbox"] = [ix0, iy0, ix1, iy1]
v["bbox"] = [x0, y0, x1, y1]
res.append(v)
return res
def remove_overlap_between_bbox(spans):
    """Public entry point that delegates to ``_remove_overlap_between_bbox``.

    Args:
        spans: passed through unchanged to the private implementation.

    Returns:
        Whatever ``_remove_overlap_between_bbox`` returns for ``spans``.
    """
    separated = _remove_overlap_between_bbox(spans)
    return separated
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment