Commit 34bde6d8 authored by 赵小蒙

classify后在jso根层级添加_pdf_type标识,同时取消对非文本类pdf的drop

parent f65be6e0
...@@ -130,6 +130,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict: ...@@ -130,6 +130,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
classify_time = int(time.time() - start_time) # 计算执行时间 classify_time = int(time.time() - start_time) # 计算执行时间
if is_text_pdf: if is_text_pdf:
pdf_meta["is_text_pdf"] = is_text_pdf pdf_meta["is_text_pdf"] = is_text_pdf
jso["_pdf_type"] = "TXT"
jso["pdf_meta"] = pdf_meta jso["pdf_meta"] = pdf_meta
jso["classify_time"] = classify_time jso["classify_time"] = classify_time
# print(json.dumps(pdf_meta, ensure_ascii=False)) # print(json.dumps(pdf_meta, ensure_ascii=False))
...@@ -144,10 +145,11 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict: ...@@ -144,10 +145,11 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
else: else:
# 先不drop # 先不drop
pdf_meta["is_text_pdf"] = is_text_pdf pdf_meta["is_text_pdf"] = is_text_pdf
jso["_pdf_type"] = "OCR"
jso["pdf_meta"] = pdf_meta jso["pdf_meta"] = pdf_meta
jso["classify_time"] = classify_time jso["classify_time"] = classify_time
jso["need_drop"] = True # jso["need_drop"] = True
jso["drop_reason"] = DropReason.NOT_IS_TEXT_PDF # jso["drop_reason"] = DropReason.NOT_IS_TEXT_PDF
extra_info = {"classify_rules": []} extra_info = {"classify_rules": []}
for condition, result in results.items(): for condition, result in results.items():
if not result: if not result:
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment