Unverified Commit c96aa88d authored by myhloli's avatar myhloli Committed by GitHub

Merge pull request #119 from icecraft/feat/parallel_paddle

feat: parallelize paddle
parents 084dc22a 738f9274
...@@ -7,6 +7,7 @@ from PIL import Image ...@@ -7,6 +7,7 @@ from PIL import Image
from loguru import logger from loguru import logger
import numpy as np import numpy as np
def region_to_bbox(region): def region_to_bbox(region):
x0 = region[0][0] x0 = region[0][0]
y0 = region[0][1] y0 = region[0][1]
...@@ -22,12 +23,14 @@ def dict_compare(d1, d2): ...@@ -22,12 +23,14 @@ def dict_compare(d1, d2):
def remove_duplicates_dicts(lst): def remove_duplicates_dicts(lst):
unique_dicts = [] unique_dicts = []
for dict_item in lst: for dict_item in lst:
if not any(dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts): if not any(
dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
):
unique_dicts.append(dict_item) unique_dicts.append(dict_item)
return unique_dicts return unique_dicts
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)
def load_imags_from_pdf(pdf_bytes: bytes, dpi=200):
imgs = [] imgs = []
with fitz.open("pdf", pdf_bytes) as doc: with fitz.open("pdf", pdf_bytes) as doc:
for index in range(0, doc.page_count): for index in range(0, doc.page_count):
...@@ -42,23 +45,20 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False): ...@@ -42,23 +45,20 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
img_dict = { img_dict = {"img": img, "width": pm.width, "height": pm.height}
"img": img,
"width": pm.width,
"height": pm.height
}
imgs.append(img_dict) imgs.append(img_dict)
model_json = []
for index, img_dict in enumerate(imgs): class CustomPaddleModel:
img = img_dict['img'] def __init___(self, ocr: bool = False, show_log: bool = False):
page_width = img_dict['width'] self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
page_height = img_dict['height']
result = ocr_engine(img) def __call__(self, img):
result = self.model(img)
spans = [] spans = []
for line in result: for line in result:
line.pop('img') line.pop("img")
''' """
为paddle输出适配type no. 为paddle输出适配type no.
title: 0 # 标题 title: 0 # 标题
text: 1 # 文本 text: 1 # 文本
...@@ -71,37 +71,38 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False): ...@@ -71,37 +71,38 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
figure_caption: 4 # 图片描述 figure_caption: 4 # 图片描述
table: 5 # 表格 table: 5 # 表格
table_caption: 6 # 表格描述 table_caption: 6 # 表格描述
''' """
if line['type'] == 'title': if line["type"] == "title":
line['category_id'] = 0 line["category_id"] = 0
elif line['type'] in ['text', 'reference']: elif line["type"] in ["text", "reference"]:
line['category_id'] = 1 line["category_id"] = 1
elif line['type'] == 'figure': elif line["type"] == "figure":
line['category_id'] = 3 line["category_id"] = 3
elif line['type'] == 'figure_caption': elif line["type"] == "figure_caption":
line['category_id'] = 4 line["category_id"] = 4
elif line['type'] == 'table': elif line["type"] == "table":
line['category_id'] = 5 line["category_id"] = 5
elif line['type'] == 'table_caption': elif line["type"] == "table_caption":
line['category_id'] = 6 line["category_id"] = 6
elif line['type'] == 'equation': elif line["type"] == "equation":
line['category_id'] = 8 line["category_id"] = 8
elif line['type'] in ['header', 'footer']: elif line["type"] in ["header", "footer"]:
line['category_id'] = 2 line["category_id"] = 2
else: else:
logger.warning(f"unknown type: {line['type']}") logger.warning(f"unknown type: {line['type']}")
# 兼容不输出score的paddleocr版本 # 兼容不输出score的paddleocr版本
if line.get("score") is None: if line.get("score") is None:
line['score'] = 0.5 + random.random() * 0.5 line["score"] = 0.5 + random.random() * 0.5
res = line.pop('res', None) res = line.pop("res", None)
if res is not None and len(res) > 0: if res is not None and len(res) > 0:
for span in res: for span in res:
new_span = {'category_id': 15, new_span = {
'bbox': region_to_bbox(span['text_region']), "category_id": 15,
'score': span['confidence'], "bbox": region_to_bbox(span["text_region"]),
'text': span['text'] "score": span["confidence"],
"text": span["text"],
} }
spans.append(new_span) spans.append(new_span)
...@@ -109,16 +110,21 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False): ...@@ -109,16 +110,21 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
result.extend(spans) result.extend(spans)
result = remove_duplicates_dicts(result) result = remove_duplicates_dicts(result)
return result
page_info = {
"page_no": index, def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
"height": page_height, imgs = load_imags_from_pdf(pdf_bytes)
"width": page_width custom_paddle = CustomPaddleModel()
}
page_dict = { model_json = []
"layout_dets": result, for index, img_dict in enumerate(imgs):
"page_info": page_info img = img_dict["img"]
} page_width = img_dict["width"]
page_height = img_dict["height"]
result = custom_paddle(img)
page_info = {"page_no": index, "height": page_height, "width": page_width}
page_dict = {"layout_dets": result, "page_info": page_info}
model_json.append(page_dict) model_json.append(page_dict)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment