Commit 26c23782, authored by 赵小蒙

ocr模式下content type 抽象

parent b6f051d8
from magic_pdf.libs.ocr_content_type import ContentType
def mk_nlp_markdown(pdf_info_dict: dict):
markdown = []
......@@ -12,9 +15,9 @@ def mk_nlp_markdown(pdf_info_dict: dict):
if not span.get('content'):
continue
content = span['content'].replace('$', '\$') # 转义$
if span['type'] == 'inline_equation':
if span['type'] == ContentType.InlineEquation:
content = f"${content}$"
elif span['type'] == 'displayed_equation':
elif span['type'] == ContentType.InterlineEquation:
content = f"$$\n{content}\n$$"
line_text += content + ' '
# 在行末添加两个空格以强制换行
......@@ -41,9 +44,9 @@ def mk_mm_markdown(pdf_info_dict: dict):
content = f"![]({span['image_path']})"
else:
content = span['content'].replace('$', '\$') # 转义$
if span['type'] == 'inline_equation':
if span['type'] == ContentType.InlineEquation:
content = f"${content}$"
elif span['type'] == 'displayed_equation':
elif span['type'] == ContentType.InterlineEquation:
content = f"$$\n{content}\n$$"
line_text += content + ' '
# 在行末添加两个空格以强制换行
......
from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.libs.ocr_content_type import ContentType
def draw_bbox_without_number(i, bbox_list, page, rgb_config):
new_rgb = []
......@@ -49,30 +51,30 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
def draw_text_bbox(pdf_info_dict, input_path, out_path):
    """Draw span bounding boxes onto a PDF and save it as ``text.pdf``.

    Walks every span under ``page['preproc_blocks'][*]['lines'][*]['spans']``
    and groups its ``bbox`` by ``span['type']`` (text, inline equation,
    interline equation). Spans of any other type are ignored. The boxes
    are then drawn page by page with ``draw_bbox_without_number`` and the
    document is written to ``f"{out_path}/text.pdf"``.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts containing a
            ``'preproc_blocks'`` list.
        input_path: Path of the PDF to open and annotate.
        out_path: Directory that receives ``text.pdf``.
    """
    text_list = []
    inline_equation_list = []
    interline_equation_list = []

    # Collect one bbox list per page for each span category.
    for page in pdf_info_dict.values():
        page_text_list = []
        page_inline_equation_list = []
        page_interline_equation_list = []
        for block in page['preproc_blocks']:
            for line in block['lines']:
                for span in line['spans']:
                    if span['type'] == ContentType.Text:
                        page_text_list.append(span['bbox'])
                    elif span['type'] == ContentType.InlineEquation:
                        page_inline_equation_list.append(span['bbox'])
                    elif span['type'] == ContentType.InterlineEquation:
                        page_interline_equation_list.append(span['bbox'])
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)

    doc = fitz.open(input_path)
    # NOTE(review): indexing the collected lists by page number assumes that
    # pdf_info_dict iterates in the same order as the document's pages -
    # confirm against the caller.
    for i, page in enumerate(doc):
        # Each category gets its own colour triple (presumably 0-255 RGB -
        # confirm in draw_bbox_without_number).
        draw_bbox_without_number(i, text_list, page, [255, 0, 0])
        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])

    # Save the PDF
    doc.save(f"{out_path}/text.pdf")
class ContentType:
    """String tags stored in ``span['type']`` by the OCR pipeline.

    The members are plain ``str`` constants, so they compare equal to the
    raw strings used in span dicts. Use these names instead of repeating
    the literals.
    """

    # Assigned to spans whose model category_id is 1.
    Image = "image"
    # Assigned to spans whose model category_id is 7.
    Table = "table"
    # Assigned to spans whose model category_id is 15 (OCR-recognised text).
    Text = "text"
    # Assigned to spans whose model category_id is 13 (formula inside a line).
    InlineEquation = "inline_equation"
    # Assigned to spans whose model category_id is 14 (formula between lines).
    InterlineEquation = "interline_equation"
......@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import (
get_docx_model_output,
)
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
......@@ -44,10 +45,10 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
'tables': tables,
'interline_equations': interline_equations,
'inline_equations': inline_equations,
'dropped_text_block': dropped_text_block,
'dropped_image_block': dropped_image_block,
'dropped_table_block': dropped_table_block,
'dropped_bboxes': need_remove_spans_bboxes_dict,
'droped_text_block': dropped_text_block,
'droped_image_block': dropped_image_block,
'droped_table_block': dropped_table_block,
'droped_bboxes': need_remove_spans_bboxes_dict,
}
return return_dict
......@@ -164,7 +165,7 @@ def parse_pdf_by_ocr(
# 1: 'image', # 图片
# 7: 'table', # 表格
# 13: 'inline_equation', # 行内公式
# 14: 'displayed_equation', # 行间公式
# 14: 'interline_equation', # 行间公式
# 15: 'text', # ocr识别文本
"""layout信息"""
# 11: 'full column', # 单栏
......@@ -173,20 +174,20 @@ def parse_pdf_by_ocr(
"bbox": bbox,
}
if category_id == 1:
span["type"] = "image"
span["type"] = ContentType.Image
elif category_id == 7:
span["type"] = "table"
span["type"] = ContentType.Table
elif category_id == 13:
span["content"] = layout_det["latex"]
span["type"] = "inline_equation"
span["type"] = ContentType.InlineEquation
elif category_id == 14:
span["content"] = layout_det["latex"]
span["type"] = "displayed_equation"
span["type"] = ContentType.InterlineEquation
elif category_id == 15:
span["content"] = layout_det["text"]
span["type"] = "text"
span["type"] = ContentType.Text
# print(span)
spans.append(span)
else:
......@@ -213,7 +214,7 @@ def parse_pdf_by_ocr(
# bbox去除粘连
spans = remove_overlap_between_bbox(spans)
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
# 对type=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
spans = adjust_bbox_for_standalone_block(spans)
......
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
......@@ -11,9 +12,9 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
for span in spans:
span_type = span['type']
if span_type == 'image':
if span_type == ContentType.Image:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
elif span_type == 'table':
elif span_type == ContentType.Table:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
return spans
......@@ -2,6 +2,7 @@ from loguru import logger
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
calculate_overlap_area_in_bbox1_area_ratio
from magic_pdf.libs.ocr_content_type import ContentType
# 将每一个line中的span从左到右排序
......@@ -29,10 +30,10 @@ def merge_spans_to_line(spans):
lines = []
current_line = [spans[0]]
for span in spans[1:]:
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if span['type'] in ["displayed_equation", "image", "table"] or any(
s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
# 则开始新行
lines.append(current_line)
current_line = [span]
......
......@@ -2,6 +2,7 @@ from loguru import logger
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
__is_overlaps_y_exceeds_threshold
from magic_pdf.libs.ocr_content_type import ContentType
def remove_overlaps_min_spans(spans):
......@@ -49,22 +50,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
for span in need_remove_spans:
spans.remove(span)
span['tag'] = drop_tag
if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
if span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
dropped_text_block.append(span)
elif span['type'] == 'image':
elif span['type'] == ContentType.Image:
dropped_image_block.append(span)
elif span['type'] == 'table':
elif span['type'] == ContentType.Table:
dropped_table_block.append(span)
return spans, dropped_text_block, dropped_image_block, dropped_table_block
def adjust_bbox_for_standalone_block(spans):
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
# 对type=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
for sb_span in spans:
if sb_span['type'] in ["displayed_equation", "image", "table"]:
if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
for text_span in spans:
if text_span['type'] in ['text', 'inline_equation']:
if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
# 判断span2的纵向高度是否被span所覆盖
if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
# 判断span2是否在span左边
......@@ -81,7 +82,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
lines = []
current_line = [spans[0]]
if spans[0]["type"] in ["displayed_equation", "image", "table"]:
if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
displayed_list.append(spans[0])
line_first_y0 = spans[0]["bbox"][1]
......@@ -91,16 +92,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
for span in spans[1:]:
# if span.get("content","") == "78.":
# print("debug")
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if span['type'] in ["displayed_equation", "image", "table"] or any(
s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
# 传入
if span["type"] in ["displayed_equation", "image", "table"]:
if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
displayed_list.append(span)
# 则开始新行
lines.append(current_line)
if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
current_line = [span]
line_first_y0 = span["bbox"][1]
......@@ -125,7 +126,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
# 添加最后一行
if current_line:
lines.append(current_line)
if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
for line in text_inline_lines:
# 按照x0坐标排序
......@@ -159,10 +160,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
span['bbox'], (0, y0, 0, y1)):
# 调整公式类型
if span["type"] == "displayed_equation":
if span["type"] == ContentType.InterlineEquation:
# 最后一行是行间公式
if j + 1 >= len(text_inline_lines):
span["type"] = "inline_equation"
span["type"] = ContentType.InlineEquation
span["bbox"][1] = y0
span["bbox"][3] = y1
else:
......@@ -170,7 +171,7 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
y0_next, y1_next = text_inline_lines[j + 1][1]
if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
y1 - y0) > span_y - span_y0:
span["type"] = "inline_equation"
span["type"] = ContentType.InlineEquation
span["bbox"][1] = y0
span["bbox"][3] = y1
break
......@@ -193,13 +194,13 @@ def get_qa_need_list(blocks):
for block in blocks:
for line in block["lines"]:
for span in line["spans"]:
if span["type"] == "image":
if span["type"] == ContentType.Image:
images.append(span)
elif span["type"] == "table":
elif span["type"] == ContentType.Table:
tables.append(span)
elif span["type"] == "inline_equation":
elif span["type"] == ContentType.InlineEquation:
inline_equations.append(span)
elif span["type"] == "displayed_equation":
elif span["type"] == ContentType.InterlineEquation:
interline_equations.append(span)
else:
continue
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment