Commit 1f45e0ab authored by kernel.h@qq.com

添加debug模式

parent f702defe
......@@ -35,7 +35,6 @@ from magic_pdf.libs.path_utils import (
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter, MODE_BIN, MODE_TXT
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
......@@ -54,7 +53,7 @@ def prepare_env():
def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
uni_pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir)
uni_pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
jso_useful_key = {
"_pdf_type": "txt",
"model_list": model_list,
......
......@@ -17,6 +17,11 @@ class MagicModel():
def get_imgs(self, page_no:int): # @许瑞
return_lst = []
image_block = {
}
img = {
"bbox":[x0,y0,x1,y1]
}
......@@ -24,10 +29,16 @@ class MagicModel():
"bbox":[x0,y0,x1,y1],
"text":"",
}
return [{"img":img, "caption":img_caption},]
image_block['bbox'] = [x0, y0, x1, y1]# 计算出来
image_block['img_body'] = img
image_blcok['img_caption'] = img_caption
return [image_block,]
# Placeholder: the body is only `pass` (shown twice because the diff view lists
# the old and new form of the same line back to back).
def get_tables(self, page_no:int) ->list: # 3 coordinates: caption, table body, table-note
pass # Xu Rui
pass # Xu Rui; same structure as image
# NOTE(review): the return statement yields a 2-tuple while the annotation says
# `list`; neither returned name is defined in the visible lines.
def get_equations(self, page_no:int)->list: # has coordinates and also text
return inline_equations, interline_equations # @Kaiwen
......
......@@ -549,7 +549,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
if "Table" in first_line_text or "Figure" in first_line_text:
pass
if debug_mode:
logger.info(line_hi.std())
logger.debug(line_hi.std())
if line_hi.std()<2:
"""行高度相同,那么判断是否居中"""
......@@ -562,7 +562,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
merge_para = [l[0] for l in layout_para[start:end+1]]
para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
if debug_mode:
logger.info(para_text)
logger.debug(para_text)
layout_para[start:end+1] = [merge_para]
index_offset -= end-start
......
......@@ -16,12 +16,13 @@ class AbsPipe(ABC):
PIP_OCR = "ocr"
PIP_TXT = "txt"
# Stores the constructor arguments on the instance and initialises
# pdf_mid_data to None.
# Diff view: the two `def` lines are the old and new signature of the same
# method; the second adds `is_debug` with a default of False, which keeps
# existing four-argument callers working.
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, ):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, is_debug:bool=False):
self.pdf_bytes = pdf_bytes
self.model_list = model_list
self.image_writer = image_writer
self.img_parent_path = img_parent_path
self.pdf_mid_data = None # not compressed
self.is_debug = is_debug
# Returns JsonCompressor.compress_json applied to self.pdf_mid_data.
# NOTE(review): pdf_mid_data is initialised to None in __init__ and is assigned
# by the pipe_parse methods of the subclasses shown on this page.
def get_compress_pdf_mid_data(self):
return JsonCompressor.compress_json(self.pdf_mid_data)
......
......@@ -6,14 +6,14 @@ from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe):
# Forwards all arguments to the parent constructor.
# Diff view: each statement appears in its old and new form back to back; the
# variant carrying `is_debug` (default False) is presumably the post-change one.
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
# No-op: the body is just `pass`.
def pipe_classify(self):
pass
# Stores the result of parse_ocr_pdf in self.pdf_mid_data.
# Diff view: the two assignments are the old and new form of one line; the
# second additionally passes self.is_debug as the `is_debug` keyword.
def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
......
......@@ -6,14 +6,14 @@ from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe):
# Forwards all arguments to the parent constructor.
# Diff view: each statement appears in its old and new form back to back; the
# variant carrying `is_debug` (default False) is presumably the post-change one.
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
# No-op: the body is just `pass`.
def pipe_classify(self):
pass
# Stores the result of parse_txt_pdf in self.pdf_mid_data.
# Diff view: the two assignments are the old and new form of one line; the
# second additionally passes self.is_debug as the `is_debug` keyword.
def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer)
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
......
......@@ -15,18 +15,18 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe(AbsPipe):
# Sets self.pdf_type to PIP_OCR, then forwards the arguments to the parent
# constructor.
# Diff view: the `def` line and the super().__init__ call each appear in old
# and new form; the variant carrying `is_debug` (default False) is presumably
# the post-change one.
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug: bool = False):
self.pdf_type = self.PIP_OCR
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
# Stores the result of UNIPipe.classify(self.pdf_bytes) in self.pdf_type.
def pipe_classify(self):
self.pdf_type = UNIPipe.classify(self.pdf_bytes)
# Dispatches on self.pdf_type: PIP_TXT uses parse_union_pdf, PIP_OCR uses
# parse_ocr_pdf; the result is stored in self.pdf_mid_data.
# Diff view: under each branch the two assignments are the old and new form of
# one line; the second additionally passes self.is_debug as `is_debug`.
# NOTE(review): no else branch is visible, so any other pdf_type value would
# leave self.pdf_mid_data unchanged.
def pipe_parse(self):
if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer)
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment