Commit 4f1f7d62 authored by 许瑞

feat: add layout

parent 90ea9096
...@@ -620,9 +620,9 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d ...@@ -620,9 +620,9 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
jso["need_drop"] = True jso["need_drop"] = True
jso["drop_reason"] = pdf_info_dict["drop_reason"] jso["drop_reason"] = pdf_info_dict["drop_reason"]
else: # 正常返回,将 pdf_info_dict 压缩并存储 else: # 正常返回,将 pdf_info_dict 压缩并存储
jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict) pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
jso["pdf_intermediate_dict"] = pdf_info_dict jso["pdf_intermediate_dict"] = pdf_info_dict
jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
end_time = time.time() # 记录完成时间 end_time = time.time() # 记录完成时间
parse_time = int(end_time - start_time) # 计算执行时间 parse_time = int(end_time - start_time) # 计算执行时间
# 解析完成后打印一下book_name和耗时 # 解析完成后打印一下book_name和耗时
......
def convert_to_train_format(jso: dict) -> []: def convert_to_train_format(jso: dict) -> []:
pages = [] pages = []
for k, v in jso.items(): for k, v in jso.items():
if not k.startswith("page_"):
continue
page_idx = v["page_idx"] page_idx = v["page_idx"]
width, height = v["page_size"] width, height = v["page_size"]
...@@ -47,6 +47,7 @@ def convert_to_train_format(jso: dict) -> []: ...@@ -47,6 +47,7 @@ def convert_to_train_format(jso: dict) -> []:
bboxes.append(n_bbox) bboxes.append(n_bbox)
info["bboxes"] = bboxes info["bboxes"] = bboxes
info["layout_tree"] = v["layout_bboxes"]
pages.append(info) pages.append(info)
return pages return pages
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment