Commit 06063014 authored by 赵小蒙

Adapt the paddle analyze mode to the CLI input mode to improve analysis speed

parent 39b46ea9
...@@ -87,6 +87,11 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, ...@@ -87,6 +87,11 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
sys.exit(1) sys.exit(1)
pipe.pipe_classify() pipe.pipe_classify()
'''如果没有传入有效的模型数据,则使用内置paddle解析'''
if len(model_list) == 0:
pipe.pipe_analyze()
pipe.pipe_parse() pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data['pdf_info'] pdf_info = pipe.pdf_mid_data['pdf_info']
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir) draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
...@@ -255,8 +260,8 @@ def pdf_command(pdf, model, method): ...@@ -255,8 +260,8 @@ def pdf_command(pdf, model, method):
model_path = pdf.replace(".pdf", ".json") model_path = pdf.replace(".pdf", ".json")
if not os.path.exists(model_path): if not os.path.exists(model_path):
logger.warning(f"not found json {model_path} existed, use paddle analyze") logger.warning(f"not found json {model_path} existed, use paddle analyze")
# 本地无模型数据则调用内置paddle分析 # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
model_json = json_parse.dumps(doc_analyze(pdf_data, ocr=False, show_log=True)) model_json = "[]"
else: else:
model_json = read_fn(model_path).decode("utf-8") model_json = read_fn(model_path).decode("utf-8")
else: else:
......
...@@ -33,6 +33,13 @@ class AbsPipe(ABC): ...@@ -33,6 +33,13 @@ class AbsPipe(ABC):
""" """
raise NotImplementedError raise NotImplementedError
@abstractmethod
def pipe_analyze(self):
    """
    Run the model analysis as a stateful pipeline step.

    Abstract hook: this base version always raises NotImplementedError.
    """
    raise NotImplementedError
@abstractmethod @abstractmethod
def pipe_parse(self): def pipe_parse(self):
""" """
......
from magic_pdf.libs.MakeContentConfig import DropMode from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf from magic_pdf.user_api import parse_ocr_pdf
...@@ -13,6 +13,9 @@ class OCRPipe(AbsPipe): ...@@ -13,6 +13,9 @@ class OCRPipe(AbsPipe):
def pipe_classify(self): def pipe_classify(self):
pass pass
def pipe_analyze(self):
    """
    Run the built-in model analysis with OCR enabled.

    Calls ``doc_analyze`` on ``self.pdf_bytes`` with ``ocr=True`` and stores
    the result in ``self.model_list``.
    """
    analyzed_models = doc_analyze(self.pdf_bytes, ocr=True)
    self.model_list = analyzed_models
def pipe_parse(self): def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
......
from magic_pdf.libs.MakeContentConfig import DropMode from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe from magic_pdf.pipe.AbsPipe import AbsPipe
...@@ -13,6 +14,9 @@ class TXTPipe(AbsPipe): ...@@ -13,6 +14,9 @@ class TXTPipe(AbsPipe):
def pipe_classify(self): def pipe_classify(self):
pass pass
def pipe_analyze(self):
    """
    Run the built-in model analysis with OCR disabled.

    Calls ``doc_analyze`` on ``self.pdf_bytes`` with ``ocr=False`` and stores
    the result in ``self.model_list``.
    """
    analyzed_models = doc_analyze(self.pdf_bytes, ocr=False)
    self.model_list = analyzed_models
def pipe_parse(self): def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
......
...@@ -3,6 +3,7 @@ import json ...@@ -3,6 +3,7 @@ import json
from loguru import logger from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
...@@ -15,14 +16,24 @@ class UNIPipe(AbsPipe): ...@@ -15,14 +16,24 @@ class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False): def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
self.pdf_type = jso_useful_key["_pdf_type"] self.pdf_type = jso_useful_key["_pdf_type"]
super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug) super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
if len(self.model_list) == 0:
self.input_model_is_empty = True
else:
self.input_model_is_empty = False
def pipe_classify(self): def pipe_classify(self):
self.pdf_type = AbsPipe.classify(self.pdf_bytes) self.pdf_type = AbsPipe.classify(self.pdf_bytes)
def pipe_analyze(self):
    """
    Run the built-in model analysis in the mode matching ``self.pdf_type``.

    ``self.PIP_TXT`` selects ``ocr=False`` and ``self.PIP_OCR`` selects
    ``ocr=True``; the result of ``doc_analyze`` is stored in
    ``self.model_list``. For any other ``pdf_type`` nothing is analyzed and
    ``self.model_list`` is left unchanged.
    """
    current_type = self.pdf_type
    if current_type == self.PIP_TXT:
        use_ocr = False
    elif current_type == self.PIP_OCR:
        use_ocr = True
    else:
        # Neither text nor OCR: keep whatever model_list already holds.
        return
    self.model_list = doc_analyze(self.pdf_bytes, ocr=use_ocr)
def pipe_parse(self): def pipe_parse(self):
if self.pdf_type == self.PIP_TXT: if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer, self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug) is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty)
elif self.pdf_type == self.PIP_OCR: elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug) is_debug=self.is_debug)
......
...@@ -16,6 +16,7 @@ import re ...@@ -16,6 +16,7 @@ import re
from loguru import logger from loguru import logger
from magic_pdf.libs.version import __version__ from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.rw import AbsReaderWriter from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
...@@ -65,6 +66,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit ...@@ -65,6 +66,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
input_model_is_empty: bool = False,
*args, **kwargs): *args, **kwargs):
""" """
ocr和文本混合的pdf,全部解析出来 ocr和文本混合的pdf,全部解析出来
...@@ -119,6 +121,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr ...@@ -119,6 +121,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02 or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
): ):
logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr") logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
if input_model_is_empty:
pdf_models = doc_analyze(pdf_bytes, ocr=True)
pdf_info_dict = parse_pdf(parse_pdf_by_ocr) pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None: if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.") raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment