Commit b94fd7f0 authored by 赵小蒙

Merge remote-tracking branch 'origin/master'

parents 755ea5b0 36e86dcb
......@@ -620,9 +620,9 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
jso["need_drop"] = True
jso["drop_reason"] = pdf_info_dict["drop_reason"]
else: # 正常返回,将 pdf_info_dict 压缩并存储
jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
jso["pdf_intermediate_dict"] = pdf_info_dict
jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
end_time = time.time() # 记录完成时间
parse_time = int(end_time - start_time) # 计算执行时间
# 解析完成后打印一下book_name和耗时
......
def convert_to_train_format(jso: dict) -> []:
pages = []
for k, v in jso.items():
if not k.startswith("page_"):
continue
page_idx = v["page_idx"]
width, height = v["page_size"]
......@@ -47,6 +47,7 @@ def convert_to_train_format(jso: dict) -> []:
bboxes.append(n_bbox)
info["bboxes"] = bboxes
info["layout_tree"] = v["layout_bboxes"]
pages.append(info)
return pages
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment