Commit 26c23782 authored by 赵小蒙

ocr模式下content type 抽象

parent b6f051d8
from magic_pdf.libs.ocr_content_type import ContentType
def mk_nlp_markdown(pdf_info_dict: dict): def mk_nlp_markdown(pdf_info_dict: dict):
markdown = [] markdown = []
...@@ -12,9 +15,9 @@ def mk_nlp_markdown(pdf_info_dict: dict): ...@@ -12,9 +15,9 @@ def mk_nlp_markdown(pdf_info_dict: dict):
if not span.get('content'): if not span.get('content'):
continue continue
content = span['content'].replace('$', '\$') # 转义$ content = span['content'].replace('$', '\$') # 转义$
if span['type'] == 'inline_equation': if span['type'] == ContentType.InlineEquation:
content = f"${content}$" content = f"${content}$"
elif span['type'] == 'displayed_equation': elif span['type'] == ContentType.InterlineEquation:
content = f"$$\n{content}\n$$" content = f"$$\n{content}\n$$"
line_text += content + ' ' line_text += content + ' '
# 在行末添加两个空格以强制换行 # 在行末添加两个空格以强制换行
...@@ -41,9 +44,9 @@ def mk_mm_markdown(pdf_info_dict: dict): ...@@ -41,9 +44,9 @@ def mk_mm_markdown(pdf_info_dict: dict):
content = f"![]({span['image_path']})" content = f"![]({span['image_path']})"
else: else:
content = span['content'].replace('$', '\$') # 转义$ content = span['content'].replace('$', '\$') # 转义$
if span['type'] == 'inline_equation': if span['type'] == ContentType.InlineEquation:
content = f"${content}$" content = f"${content}$"
elif span['type'] == 'displayed_equation': elif span['type'] == ContentType.InterlineEquation:
content = f"$$\n{content}\n$$" content = f"$$\n{content}\n$$"
line_text += content + ' ' line_text += content + ' '
# 在行末添加两个空格以强制换行 # 在行末添加两个空格以强制换行
......
from magic_pdf.libs.commons import fitz # PyMuPDF from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.libs.ocr_content_type import ContentType
def draw_bbox_without_number(i, bbox_list, page, rgb_config): def draw_bbox_without_number(i, bbox_list, page, rgb_config):
new_rgb = [] new_rgb = []
...@@ -49,30 +51,30 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path): ...@@ -49,30 +51,30 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
def draw_text_bbox(pdf_info_dict, input_path, out_path): def draw_text_bbox(pdf_info_dict, input_path, out_path):
text_list = [] text_list = []
inline_equation_list = [] inline_equation_list = []
displayed_equation_list = [] interline_equation_list = []
for page in pdf_info_dict.values(): for page in pdf_info_dict.values():
page_text_list = [] page_text_list = []
page_inline_equation_list = [] page_inline_equation_list = []
page_displayed_equation_list = [] page_interline_equation_list = []
for block in page['preproc_blocks']: for block in page['preproc_blocks']:
for line in block['lines']: for line in block['lines']:
for span in line['spans']: for span in line['spans']:
if span['type'] == 'text': if span['type'] == ContentType.Text:
page_text_list.append(span['bbox']) page_text_list.append(span['bbox'])
elif span['type'] == 'inline_equation': elif span['type'] == ContentType.InlineEquation:
page_inline_equation_list.append(span['bbox']) page_inline_equation_list.append(span['bbox'])
elif span['type'] == 'displayed_equation': elif span['type'] == ContentType.InterlineEquation:
page_displayed_equation_list.append(span['bbox']) page_interline_equation_list.append(span['bbox'])
text_list.append(page_text_list) text_list.append(page_text_list)
inline_equation_list.append(page_inline_equation_list) inline_equation_list.append(page_inline_equation_list)
displayed_equation_list.append(page_displayed_equation_list) interline_equation_list.append(page_interline_equation_list)
doc = fitz.open(input_path) doc = fitz.open(input_path)
for i, page in enumerate(doc): for i, page in enumerate(doc):
# 获取当前页面的数据 # 获取当前页面的数据
draw_bbox_without_number(i, text_list, page, [255, 0, 0]) draw_bbox_without_number(i, text_list, page, [255, 0, 0])
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0]) draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
draw_bbox_without_number(i, displayed_equation_list, page, [0, 0, 255]) draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
# Save the PDF # Save the PDF
doc.save(f"{out_path}/text.pdf") doc.save(f"{out_path}/text.pdf")
class ContentType:
    """String tags stored in a span's ``'type'`` field in OCR mode.

    Each attribute is a plain ``str``, so comparing it with ``==`` against
    the raw literal (for example ``'image'``) is still true.
    """

    # Picture region.
    Image = 'image'
    # Table region.
    Table = 'table'
    # Ordinary recognised text.
    Text = 'text'
    # Formula embedded in a line of text.
    InlineEquation = 'inline_equation'
    # Formula standing on its own between lines of text.
    InterlineEquation = 'interline_equation'
...@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import ( ...@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import (
get_docx_model_output, get_docx_model_output,
) )
from magic_pdf.libs.coordinate_transform import get_scale_ratio from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.safe_filename import sanitize_filename from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
...@@ -44,10 +45,10 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay ...@@ -44,10 +45,10 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
'tables': tables, 'tables': tables,
'interline_equations': interline_equations, 'interline_equations': interline_equations,
'inline_equations': inline_equations, 'inline_equations': inline_equations,
'dropped_text_block': dropped_text_block, 'droped_text_block': dropped_text_block,
'dropped_image_block': dropped_image_block, 'droped_image_block': dropped_image_block,
'dropped_table_block': dropped_table_block, 'droped_table_block': dropped_table_block,
'dropped_bboxes': need_remove_spans_bboxes_dict, 'droped_bboxes': need_remove_spans_bboxes_dict,
} }
return return_dict return return_dict
...@@ -164,7 +165,7 @@ def parse_pdf_by_ocr( ...@@ -164,7 +165,7 @@ def parse_pdf_by_ocr(
# 1: 'image', # 图片 # 1: 'image', # 图片
# 7: 'table', # 表格 # 7: 'table', # 表格
# 13: 'inline_equation', # 行内公式 # 13: 'inline_equation', # 行内公式
# 14: 'displayed_equation', # 行间公式 # 14: 'interline_equation', # 行间公式
# 15: 'text', # ocr识别文本 # 15: 'text', # ocr识别文本
"""layout信息""" """layout信息"""
# 11: 'full column', # 单栏 # 11: 'full column', # 单栏
...@@ -173,20 +174,20 @@ def parse_pdf_by_ocr( ...@@ -173,20 +174,20 @@ def parse_pdf_by_ocr(
"bbox": bbox, "bbox": bbox,
} }
if category_id == 1: if category_id == 1:
span["type"] = "image" span["type"] = ContentType.Image
elif category_id == 7: elif category_id == 7:
span["type"] = "table" span["type"] = ContentType.Table
elif category_id == 13: elif category_id == 13:
span["content"] = layout_det["latex"] span["content"] = layout_det["latex"]
span["type"] = "inline_equation" span["type"] = ContentType.InlineEquation
elif category_id == 14: elif category_id == 14:
span["content"] = layout_det["latex"] span["content"] = layout_det["latex"]
span["type"] = "displayed_equation" span["type"] = ContentType.InterlineEquation
elif category_id == 15: elif category_id == 15:
span["content"] = layout_det["text"] span["content"] = layout_det["text"]
span["type"] = "text" span["type"] = ContentType.Text
# print(span) # print(span)
spans.append(span) spans.append(span)
else: else:
...@@ -213,7 +214,7 @@ def parse_pdf_by_ocr( ...@@ -213,7 +214,7 @@ def parse_pdf_by_ocr(
# bbox去除粘连 # bbox去除粘连
spans = remove_overlap_between_bbox(spans) spans = remove_overlap_between_bbox(spans)
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0 # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
spans = adjust_bbox_for_standalone_block(spans) spans = adjust_bbox_for_standalone_block(spans)
......
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image from magic_pdf.libs.pdf_image_tools import cut_image
...@@ -11,9 +12,9 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path): ...@@ -11,9 +12,9 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
for span in spans: for span in spans:
span_type = span['type'] span_type = span['type']
if span_type == 'image': if span_type == ContentType.Image:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images')) span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
elif span_type == 'table': elif span_type == ContentType.Table:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables')) span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
return spans return spans
...@@ -2,6 +2,7 @@ from loguru import logger ...@@ -2,6 +2,7 @@ from loguru import logger
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
calculate_overlap_area_in_bbox1_area_ratio calculate_overlap_area_in_bbox1_area_ratio
from magic_pdf.libs.ocr_content_type import ContentType
# 将每一个line中的span从左到右排序 # 将每一个line中的span从左到右排序
...@@ -29,10 +30,10 @@ def merge_spans_to_line(spans): ...@@ -29,10 +30,10 @@ def merge_spans_to_line(spans):
lines = [] lines = []
current_line = [spans[0]] current_line = [spans[0]]
for span in spans[1:]: for span in spans[1:]:
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation" # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上 # image和table类型,同上
if span['type'] in ["displayed_equation", "image", "table"] or any( if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
s['type'] in ["displayed_equation", "image", "table"] for s in current_line): s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
# 则开始新行 # 则开始新行
lines.append(current_line) lines.append(current_line)
current_line = [span] current_line = [span]
......
...@@ -2,6 +2,7 @@ from loguru import logger ...@@ -2,6 +2,7 @@ from loguru import logger
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \ from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
__is_overlaps_y_exceeds_threshold __is_overlaps_y_exceeds_threshold
from magic_pdf.libs.ocr_content_type import ContentType
def remove_overlaps_min_spans(spans): def remove_overlaps_min_spans(spans):
...@@ -49,22 +50,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict): ...@@ -49,22 +50,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
for span in need_remove_spans: for span in need_remove_spans:
spans.remove(span) spans.remove(span)
span['tag'] = drop_tag span['tag'] = drop_tag
if span['type'] in ['text', 'inline_equation', 'displayed_equation']: if span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
dropped_text_block.append(span) dropped_text_block.append(span)
elif span['type'] == 'image': elif span['type'] == ContentType.Image:
dropped_image_block.append(span) dropped_image_block.append(span)
elif span['type'] == 'table': elif span['type'] == ContentType.Table:
dropped_table_block.append(span) dropped_table_block.append(span)
return spans, dropped_text_block, dropped_image_block, dropped_table_block return spans, dropped_text_block, dropped_image_block, dropped_table_block
def adjust_bbox_for_standalone_block(spans): def adjust_bbox_for_standalone_block(spans):
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0 # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
for sb_span in spans: for sb_span in spans:
if sb_span['type'] in ["displayed_equation", "image", "table"]: if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
for text_span in spans: for text_span in spans:
if text_span['type'] in ['text', 'inline_equation']: if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
# 判断span2的纵向高度是否被span所覆盖 # 判断span2的纵向高度是否被span所覆盖
if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]: if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
# 判断span2是否在span左边 # 判断span2是否在span左边
...@@ -81,7 +82,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list): ...@@ -81,7 +82,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
lines = [] lines = []
current_line = [spans[0]] current_line = [spans[0]]
if spans[0]["type"] in ["displayed_equation", "image", "table"]: if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
displayed_list.append(spans[0]) displayed_list.append(spans[0])
line_first_y0 = spans[0]["bbox"][1] line_first_y0 = spans[0]["bbox"][1]
...@@ -91,16 +92,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list): ...@@ -91,16 +92,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
for span in spans[1:]: for span in spans[1:]:
# if span.get("content","") == "78.": # if span.get("content","") == "78.":
# print("debug") # print("debug")
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation" # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上 # image和table类型,同上
if span['type'] in ["displayed_equation", "image", "table"] or any( if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
s['type'] in ["displayed_equation", "image", "table"] for s in current_line): s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
# 传入 # 传入
if span["type"] in ["displayed_equation", "image", "table"]: if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
displayed_list.append(span) displayed_list.append(span)
# 则开始新行 # 则开始新行
lines.append(current_line) lines.append(current_line)
if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]: if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
text_inline_lines.append((current_line, (line_first_y0, line_first_y))) text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
current_line = [span] current_line = [span]
line_first_y0 = span["bbox"][1] line_first_y0 = span["bbox"][1]
...@@ -125,7 +126,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list): ...@@ -125,7 +126,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
# 添加最后一行 # 添加最后一行
if current_line: if current_line:
lines.append(current_line) lines.append(current_line)
if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]: if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
text_inline_lines.append((current_line, (line_first_y0, line_first_y))) text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
for line in text_inline_lines: for line in text_inline_lines:
# 按照x0坐标排序 # 按照x0坐标排序
...@@ -159,10 +160,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: ...@@ -159,10 +160,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
span['bbox'], (0, y0, 0, y1)): span['bbox'], (0, y0, 0, y1)):
# 调整公式类型 # 调整公式类型
if span["type"] == "displayed_equation": if span["type"] == ContentType.InterlineEquation:
# 最后一行是行间公式 # 最后一行是行间公式
if j + 1 >= len(text_inline_lines): if j + 1 >= len(text_inline_lines):
span["type"] = "inline_equation" span["type"] = ContentType.InlineEquation
span["bbox"][1] = y0 span["bbox"][1] = y0
span["bbox"][3] = y1 span["bbox"][3] = y1
else: else:
...@@ -170,7 +171,7 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: ...@@ -170,7 +171,7 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
y0_next, y1_next = text_inline_lines[j + 1][1] y0_next, y1_next = text_inline_lines[j + 1][1]
if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * ( if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
y1 - y0) > span_y - span_y0: y1 - y0) > span_y - span_y0:
span["type"] = "inline_equation" span["type"] = ContentType.InlineEquation
span["bbox"][1] = y0 span["bbox"][1] = y0
span["bbox"][3] = y1 span["bbox"][3] = y1
break break
...@@ -193,13 +194,13 @@ def get_qa_need_list(blocks): ...@@ -193,13 +194,13 @@ def get_qa_need_list(blocks):
for block in blocks: for block in blocks:
for line in block["lines"]: for line in block["lines"]:
for span in line["spans"]: for span in line["spans"]:
if span["type"] == "image": if span["type"] == ContentType.Image:
images.append(span) images.append(span)
elif span["type"] == "table": elif span["type"] == ContentType.Table:
tables.append(span) tables.append(span)
elif span["type"] == "inline_equation": elif span["type"] == ContentType.InlineEquation:
inline_equations.append(span) inline_equations.append(span)
elif span["type"] == "displayed_equation": elif span["type"] == ContentType.InterlineEquation:
interline_equations.append(span) interline_equations.append(span)
else: else:
continue continue
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment