Unverified Commit c96aa88d authored by myhloli's avatar myhloli Committed by GitHub

Merge pull request #119 from icecraft/feat/parallel_paddle

feat: parallelize paddle
parents 084dc22a 738f9274
......@@ -7,6 +7,7 @@ from PIL import Image
from loguru import logger
import numpy as np
def region_to_bbox(region):
x0 = region[0][0]
y0 = region[0][1]
......@@ -22,12 +23,14 @@ def dict_compare(d1, d2):
def remove_duplicates_dicts(lst):
unique_dicts = []
for dict_item in lst:
if not any(dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts):
if not any(
dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
):
unique_dicts.append(dict_item)
return unique_dicts
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)
def load_imags_from_pdf(pdf_bytes: bytes, dpi=200):
imgs = []
with fitz.open("pdf", pdf_bytes) as doc:
for index in range(0, doc.page_count):
......@@ -42,23 +45,20 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
img_dict = {
"img": img,
"width": pm.width,
"height": pm.height
}
img_dict = {"img": img, "width": pm.width, "height": pm.height}
imgs.append(img_dict)
model_json = []
for index, img_dict in enumerate(imgs):
img = img_dict['img']
page_width = img_dict['width']
page_height = img_dict['height']
result = ocr_engine(img)
class CustomPaddleModel:
def __init___(self, ocr: bool = False, show_log: bool = False):
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
def __call__(self, img):
result = self.model(img)
spans = []
for line in result:
line.pop('img')
'''
line.pop("img")
"""
为paddle输出适配type no.
title: 0 # 标题
text: 1 # 文本
......@@ -71,37 +71,38 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
figure_caption: 4 # 图片描述
table: 5 # 表格
table_caption: 6 # 表格描述
'''
if line['type'] == 'title':
line['category_id'] = 0
elif line['type'] in ['text', 'reference']:
line['category_id'] = 1
elif line['type'] == 'figure':
line['category_id'] = 3
elif line['type'] == 'figure_caption':
line['category_id'] = 4
elif line['type'] == 'table':
line['category_id'] = 5
elif line['type'] == 'table_caption':
line['category_id'] = 6
elif line['type'] == 'equation':
line['category_id'] = 8
elif line['type'] in ['header', 'footer']:
line['category_id'] = 2
"""
if line["type"] == "title":
line["category_id"] = 0
elif line["type"] in ["text", "reference"]:
line["category_id"] = 1
elif line["type"] == "figure":
line["category_id"] = 3
elif line["type"] == "figure_caption":
line["category_id"] = 4
elif line["type"] == "table":
line["category_id"] = 5
elif line["type"] == "table_caption":
line["category_id"] = 6
elif line["type"] == "equation":
line["category_id"] = 8
elif line["type"] in ["header", "footer"]:
line["category_id"] = 2
else:
logger.warning(f"unknown type: {line['type']}")
# 兼容不输出score的paddleocr版本
if line.get("score") is None:
line['score'] = 0.5 + random.random() * 0.5
line["score"] = 0.5 + random.random() * 0.5
res = line.pop('res', None)
res = line.pop("res", None)
if res is not None and len(res) > 0:
for span in res:
new_span = {'category_id': 15,
'bbox': region_to_bbox(span['text_region']),
'score': span['confidence'],
'text': span['text']
new_span = {
"category_id": 15,
"bbox": region_to_bbox(span["text_region"]),
"score": span["confidence"],
"text": span["text"],
}
spans.append(new_span)
......@@ -109,16 +110,21 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
result.extend(spans)
result = remove_duplicates_dicts(result)
return result
page_info = {
"page_no": index,
"height": page_height,
"width": page_width
}
page_dict = {
"layout_dets": result,
"page_info": page_info
}
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
imgs = load_imags_from_pdf(pdf_bytes)
custom_paddle = CustomPaddleModel()
model_json = []
for index, img_dict in enumerate(imgs):
img = img_dict["img"]
page_width = img_dict["width"]
page_height = img_dict["height"]
result = custom_paddle(img)
page_info = {"page_no": index, "height": page_height, "width": page_width}
page_dict = {"layout_dets": result, "page_info": page_info}
model_json.append(page_dict)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment