Commit 06063014 authored by 赵小蒙

Adapt the paddle analysis mode to the CLI input mode to improve analysis speed

parent 39b46ea9
......@@ -87,6 +87,11 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
sys.exit(1)
pipe.pipe_classify()
'''如果没有传入有效的模型数据,则使用内置paddle解析'''
if len(model_list) == 0:
pipe.pipe_analyze()
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data['pdf_info']
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
......@@ -255,8 +260,8 @@ def pdf_command(pdf, model, method):
model_path = pdf.replace(".pdf", ".json")
if not os.path.exists(model_path):
logger.warning(f"not found json {model_path} existed, use paddle analyze")
# 本地无模型数据则调用内置paddle分析
model_json = json_parse.dumps(doc_analyze(pdf_data, ocr=False, show_log=True))
# 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
model_json = "[]"
else:
model_json = read_fn(model_path).decode("utf-8")
else:
......
......@@ -33,6 +33,13 @@ class AbsPipe(ABC):
"""
raise NotImplementedError
@abstractmethod
def pipe_analyze(self):
    """
    Stateful model-analysis step.

    Abstract hook: this base version only raises ``NotImplementedError``.
    """
    raise NotImplementedError
@abstractmethod
def pipe_parse(self):
"""
......
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf
......@@ -13,6 +13,9 @@ class OCRPipe(AbsPipe):
def pipe_classify(self):
    """Classification step; intentionally does nothing in this pipe."""
    return None
def pipe_analyze(self):
    """
    Run ``doc_analyze`` on ``self.pdf_bytes`` with ``ocr=True`` and keep the
    result in ``self.model_list``.
    """
    analysis_result = doc_analyze(self.pdf_bytes, ocr=True)
    self.model_list = analysis_result
def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
......
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
......@@ -13,6 +14,9 @@ class TXTPipe(AbsPipe):
def pipe_classify(self):
    """Classification step; intentionally does nothing in this pipe."""
    return None
def pipe_analyze(self):
    """
    Run ``doc_analyze`` on ``self.pdf_bytes`` with ``ocr=False`` and keep the
    result in ``self.model_list``.
    """
    analysis_result = doc_analyze(self.pdf_bytes, ocr=False)
    self.model_list = analysis_result
def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
......
......@@ -3,6 +3,7 @@ import json
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path
......@@ -15,14 +16,24 @@ class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
    """
    Build the pipe from the PDF bytes and the ``jso_useful_key`` mapping.

    Reads the ``"_pdf_type"`` and ``"model_list"`` entries of
    ``jso_useful_key``, forwards the model list (with the other arguments)
    to the base-class constructor, and then records in
    ``self.input_model_is_empty`` whether ``self.model_list`` is empty.
    """
    self.pdf_type = jso_useful_key["_pdf_type"]
    incoming_models = jso_useful_key["model_list"]
    super().__init__(pdf_bytes, incoming_models, image_writer, is_debug)
    # True when no model data is present right after construction.
    self.input_model_is_empty = len(self.model_list) == 0
def pipe_classify(self):
    """
    Store the result of ``AbsPipe.classify(self.pdf_bytes)`` in
    ``self.pdf_type``, replacing the value set at construction.
    """
    detected_type = AbsPipe.classify(self.pdf_bytes)
    self.pdf_type = detected_type
def pipe_analyze(self):
    """
    Run ``doc_analyze`` on ``self.pdf_bytes`` and store the result in
    ``self.model_list``.

    OCR is disabled when ``self.pdf_type`` equals ``self.PIP_TXT`` and
    enabled when it equals ``self.PIP_OCR``. For any other value nothing is
    analysed and ``self.model_list`` is left untouched.
    """
    current_type = self.pdf_type
    if current_type == self.PIP_TXT:
        use_ocr = False
    elif current_type == self.PIP_OCR:
        use_ocr = True
    else:
        # Neither known type: keep the existing model list as-is.
        return
    self.model_list = doc_analyze(self.pdf_bytes, ocr=use_ocr)
def pipe_parse(self):
if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug)
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty)
elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug)
......
......@@ -16,6 +16,7 @@ import re
from loguru import logger
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
......@@ -65,6 +66,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
input_model_is_empty: bool = False,
*args, **kwargs):
"""
ocr和文本混合的pdf,全部解析出来
......@@ -119,6 +121,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
):
logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
if input_model_is_empty:
pdf_models = doc_analyze(pdf_bytes, ocr=True)
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment