Unverified commit 36e86dcb, authored by icecraft and committed by GitHub

Merge pull request #7 from myhloli/feat/add_layout

feat: add layout
parents 1d5d7781 4f1f7d62
......@@ -620,9 +620,9 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
jso["need_drop"] = True
jso["drop_reason"] = pdf_info_dict["drop_reason"]
else: # 正常返回,将 pdf_info_dict 压缩并存储
jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
jso["pdf_intermediate_dict"] = pdf_info_dict
jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
end_time = time.time() # 记录完成时间
parse_time = int(end_time - start_time) # 计算执行时间
# 解析完成后打印一下book_name和耗时
......
def convert_to_train_format(jso: dict) -> []:
pages = []
for k, v in jso.items():
if not k.startswith("page_"):
continue
page_idx = v["page_idx"]
width, height = v["page_size"]
......@@ -47,6 +47,7 @@ def convert_to_train_format(jso: dict) -> []:
bboxes.append(n_bbox)
info["bboxes"] = bboxes
info["layout_tree"] = v["layout_bboxes"]
pages.append(info)
return pages
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment