Commit 432e1ae5 authored by xu rui's avatar xu rui

feat: process title and footnote

parent e3e125ba
......@@ -253,6 +253,7 @@ def parse_pdf_for_train(
# isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, model_output_json)
接下来开始进行预处理过程
"""
title_bboxs = parse_titles(page_id, page, model_output_json)
"""去掉每页的页码、页眉、页脚"""
page_no_bboxs = parse_pageNos(page_id, page, model_output_json)
......
......@@ -35,7 +35,15 @@ def convert_to_train_format(jso: dict) -> []:
# 脚注, 目前没有看到例子
for para in v["para_blocks"]:
n_bbox = {"category_id": 2, "bbox": para["bbox"]}
if "paras" in para:
paras = para["paras"]
for para_key, para_content in paras.items():
para_bbox = para_content["para_bbox"]
is_para_title = para_content["is_para_title"]
if is_para_title:
n_bbox = {"category_id": 0, "bbox": para_bbox}
else:
n_bbox = {"category_id": 2, "bbox": para_bbox}
bboxes.append(n_bbox)
for inline_equation in v["inline_equations"]:
......@@ -46,6 +54,10 @@ def convert_to_train_format(jso: dict) -> []:
n_bbox = {"category_id": 10, "bbox": inter_equation["bbox"]}
bboxes.append(n_bbox)
for footnote in v['footnote_bboxes_tmp']:
n_bbox = {"category_id": 5, "bbox": footnote["bbox"]}
bboxes.append(n_bbox)
info["bboxes"] = bboxes
info["layout_tree"] = v["layout_bboxes"]
pages.append(info)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment