Commit 2e32ecfe authored by liukaiwen

Merge branch 'master' of github.com:myhloli/Magic-PDF

# Conflicts:
#	demo/draw_bbox.py
parents 1d59509d f31117de
from magic_pdf.libs.commons import fitz # PyMuPDF
# Path of the PDF file
pdf_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_1_org.pdf"
def draw_bbox(i, bbox_list, page, rgb_config):
    """Outline every bounding box recorded for page index ``i`` on ``page``.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: One entry per page; each entry is an iterable of
            ``(x0, y0, x1, y1)`` boxes.
        page: Page object the rectangles are drawn on via ``page.draw_rect``.
        rgb_config: Three colour channels on a 0-255 scale.

    Returns:
        None. The rectangles are drawn on ``page`` as a side effect.
    """
    # The channels are rescaled from 0-255 to 0-1 before being passed as colour.
    new_rgb = [float(channel) / 255 for channel in rgb_config]

    # Pages beyond the collected data have nothing to draw; indexing them
    # would raise IndexError.
    if i >= len(bbox_list):
        return

    for x0, y0, x1, y1 in bbox_list[i]:
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        page.draw_rect(rect_coords, color=new_rgb, fill=None, width=0.5, overlay=True)  # Draw the rectangle
# NOTE(review): this opens the hard-coded pdf_path; if it sits at module level
# it runs whenever the module is imported. It looks like pre-merge demo
# residue - confirm it is still needed.
doc = fitz.open(pdf_path) # Open the PDF
# Your data
data = [[(294.7569528415961, 776.8430953398889, 300.8827085852479, 786.922616502779), (460.1523579201934, 776.8430953398889, 509.51874244256345, 787.2825994014537)], [(294.03627569528413, 779.7229585292861, 301.24304715840384, 788.3625480974777), (85.76058041112454, 781.882855921334, 156.74727932285367, 789.8024796921762)], [(293.6759371221282, 779.7229585292861, 301.60338573155985, 788.7225309961523), (459.43168077388145, 779.7229585292861, 508.7980652962515, 789.8024796921762)], [(295.8379685610641, 780.0829414279607, 301.24304715840384, 788.0025651988029), (85.76058041112454, 781.5228730226593, 156.74727932285367, 790.1624625908509)], [(294.03627569528413, 779.7229585292861, 301.60338573155985, 789.0825138948269), (459.79201934703747, 779.7229585292861, 508.4377267230955, 789.4424967935015)], [(86.4812575574365, 781.882855921334, 156.0266021765417, 789.8024796921762)], [(294.39661426844015, 779.7229585292861, 301.24304715840384, 788.3625480974777), (459.43168077388145, 779.7229585292861, 508.7980652962515, 789.4424967935015)], [(294.03627569528413, 779.7229585292861, 301.24304715840384, 788.3625480974777), (85.76058041112454, 781.5228730226593, 156.74727932285367, 789.8024796921762)], [(294.39661426844015, 779.7229585292861, 300.8827085852479, 788.3625480974777)]]
def draw_layout_bbox(pdf_info_dict, input_path, out_path):
    """Draw each page's layout boxes onto the PDF and save it as ``layout.pdf``.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Each page dict
            holds a ``'layout_bboxes'`` list whose items carry a
            ``'layout_bbox'`` entry of ``(x0, y0, x1, y1)``.
        input_path: Path of the PDF opened with ``fitz.open``.
        out_path: Directory the annotated ``layout.pdf`` is written into.

    Returns:
        None. The annotated document is saved to ``{out_path}/layout.pdf``.
    """
    # Collect the layout boxes page by page, in the dict's iteration order.
    layout_bbox_list = [
        [layout['layout_bbox'] for layout in page['layout_bboxes']]
        for page in pdf_info_dict.values()
    ]

    # The context manager closes the document handle once the copy is saved.
    with fitz.open(input_path) as doc:
        for i, page in enumerate(doc):
            # The document may contain more pages than were parsed; the
            # remaining pages have no layout data to draw.
            if i >= len(layout_bbox_list):
                break
            # Boxes of the current page
            for j, bbox in enumerate(layout_bbox_list[i]):
                x0, y0, x1, y1 = bbox
                rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
                page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=0.5, overlay=True)  # Draw the rectangle
                page.insert_text((x0, y0), str(j + 1), fontsize=10, color=(1, 0, 0))  # Insert the index at the top left corner of the rectangle
        # Save the PDF
        doc.save(f"{out_path}/layout.pdf")
def draw_text_bbox(pdf_info_dict, input_path, out_path):
    """Draw text and equation span boxes onto the PDF and save it as ``text.pdf``.

    Text spans are outlined in red, inline equations in green and displayed
    equations in blue. Spans of any other type are ignored.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Each page dict
            holds ``'preproc_blocks'``; every block has ``'lines'``, every line
            has ``'spans'``, and every span has ``'type'`` and ``'bbox'``.
        input_path: Path of the PDF opened with ``fitz.open``.
        out_path: Directory the annotated ``text.pdf`` is written into.

    Returns:
        None. The annotated document is saved to ``{out_path}/text.pdf``.
    """
    text_list = []
    inline_equation_list = []
    displayed_equation_list = []
    for page in pdf_info_dict.values():
        # One bucket per drawn span type for the current page.
        buckets = {'text': [], 'inline_equation': [], 'displayed_equation': []}
        for block in page['preproc_blocks']:
            for line in block['lines']:
                for span in line['spans']:
                    bucket = buckets.get(span['type'])
                    if bucket is not None:
                        bucket.append(span['bbox'])
        text_list.append(buckets['text'])
        inline_equation_list.append(buckets['inline_equation'])
        displayed_equation_list.append(buckets['displayed_equation'])

    # The context manager closes the document handle once the copy is saved.
    with fitz.open(input_path) as doc:
        for i, page in enumerate(doc):
            # The three lists have one entry per parsed page; stop once the
            # document runs past them.
            if i >= len(text_list):
                break
            # Boxes of the current page
            draw_bbox(i, text_list, page, [255, 0, 0])
            draw_bbox(i, inline_equation_list, page, [0, 255, 0])
            draw_bbox(i, displayed_equation_list, page, [0, 0, 255])
        # Save the PDF
        doc.save(f"{out_path}/text.pdf")
......@@ -4,7 +4,7 @@ import os
from loguru import logger
from pathlib import Path
from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown
from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown, mk_mm_markdown
from magic_pdf.libs.commons import join_path
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
......@@ -30,15 +30,20 @@ def read_json_file(file_path):
if __name__ == '__main__':
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0_org.pdf"
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
try:
ocr_pdf_model_info = read_json_file(ocr_json_file_path)
pth = Path(ocr_json_file_path)
book_name = pth.name
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
save_path = join_path(save_tmp_path, "md")
text_content_save_path = f"{save_path}/{book_name}/book.md"
save_path_with_bookname = os.path.join(save_path, book_name)
text_content_save_path = f"{save_path_with_bookname}/book.md"
pdf_info_dict = parse_pdf_by_ocr(
ocr_pdf_path,
None,
......@@ -46,11 +51,13 @@ if __name__ == '__main__':
save_path,
book_name,
debug_mode=True)
parent_dir = os.path.dirname(text_content_save_path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
markdown_content = mk_nlp_markdown(pdf_info_dict)
# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content = mk_mm_markdown(pdf_info_dict)
with open(text_content_save_path, "w", encoding="utf-8") as f:
f.write(markdown_content)
......
......@@ -21,3 +21,31 @@ def mk_nlp_markdown(pdf_info_dict: dict):
# 在行末添加两个空格以强制换行
markdown.append(line_text.strip() + ' ')
return '\n'.join(markdown)
def mk_mm_markdown(pdf_info_dict: dict):
    """Render parsed page info as multimodal Markdown (text, equations, images).

    Each line of each block becomes one Markdown line. Spans with content are
    emitted with ``$`` escaped; inline equations are wrapped in ``$...$`` and
    displayed equations in ``$$`` fences. Spans without content but with an
    ``'image_path'`` become image links. Spans with neither are skipped.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Pages without
            ``'preproc_blocks'`` are skipped. Blocks hold ``'lines'`` and
            lines hold ``'spans'``.

    Returns:
        The Markdown text, with lines joined by newlines and each line ending
        in two spaces.
    """
    markdown = []

    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                pieces = []
                for span in line['spans']:
                    if span.get('content'):
                        # Escape '$' so it is not read as a math delimiter.
                        content = span['content'].replace('$', '\\$')
                        span_type = span.get('type')
                        if span_type == 'inline_equation':
                            content = f"${content}$"
                        elif span_type == 'displayed_equation':
                            content = f"$$\n{content}\n$$"
                    elif span.get('image_path'):
                        content = f"![]({span['image_path']})"
                    else:
                        # Neither content nor an image: nothing to emit.
                        continue
                    pieces.append(content)
                # Two trailing spaces force a Markdown hard line break.
                markdown.append(' '.join(pieces).strip() + '  ')
    return '\n'.join(markdown)
......@@ -4,6 +4,7 @@ import time
from loguru import logger
from demo.draw_bbox import draw_layout_bbox, draw_text_bbox
from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.safe_filename import sanitize_filename
......@@ -185,14 +186,11 @@ def parse_pdf_by_ocr(
# 在测试时,保存调试信息
if debug_mode:
params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
page_draw_rect_save_path = join_path(save_tmp_path, "md", book_name, "layout.pdf")
with open(params_file_save_path, "w", encoding="utf-8") as f:
json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
# 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除
if os.path.exists(page_draw_rect_save_path):
os.remove(page_draw_rect_save_path)
# 绘制bbox和layout到pdf
# drow_bbox
draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
return pdf_info_dict
......@@ -12,8 +12,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
for span in spans:
span_type = span['type']
if span_type == 'image':
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('image'))
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
elif span_type == 'table':
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('table'))
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
return spans
import fitz
from magic_pdf.layout.layout_sort import get_bboxes_layout
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
from magic_pdf.libs.coordinate_transform import get_scale_ratio
......@@ -26,23 +27,16 @@ def get_area(bbox):
return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
def adjust_layouts(layout_bboxes):
def adjust_layouts(layout_bboxes, page_boundry, page_id):
# 遍历所有布局框
for i in range(len(layout_bboxes)):
# 遍历当前布局框之后的布局框
for j in range(i + 1, len(layout_bboxes)):
# 判断两个布局框是否重叠
if _is_part_overlap(layout_bboxes[i]["layout_bbox"], layout_bboxes[j]["layout_bbox"]):
if _is_part_overlap(layout_bboxes[i], layout_bboxes[j]):
# 计算每个布局框的中心点坐标和面积
center_i = get_center_point(layout_bboxes[i]["layout_bbox"])
area_i = get_area(layout_bboxes[i]["layout_bbox"])
center_j = get_center_point(layout_bboxes[j]["layout_bbox"])
area_j = get_area(layout_bboxes[j]["layout_bbox"])
# 计算横向和纵向的距离差
dx = abs(center_i[0] - center_j[0])
dy = abs(center_i[1] - center_j[1])
area_i = get_area(layout_bboxes[i])
area_j = get_area(layout_bboxes[j])
# 较大布局框和较小布局框的赋值
if area_i > area_j:
......@@ -50,19 +44,29 @@ def adjust_layouts(layout_bboxes):
else:
larger_layout, smaller_layout = layout_bboxes[j], layout_bboxes[i]
center_large = get_center_point(larger_layout)
center_small = get_center_point(smaller_layout)
# 计算横向和纵向的距离差
distance_x = center_large[0] - center_small[0]
distance_y = center_large[1] - center_small[1]
# 根据距离差判断重叠方向并修正边界
if dx > dy: # 左右重叠
if larger_layout["layout_bbox"][0] < smaller_layout["layout_bbox"][2]:
larger_layout["layout_bbox"][0] = smaller_layout["layout_bbox"][2]
else:
larger_layout["layout_bbox"][2] = smaller_layout["layout_bbox"][0]
if abs(distance_x) > abs(distance_y): # 左右重叠
if distance_x > 0 and larger_layout[0] < smaller_layout[2]:
larger_layout[0] = smaller_layout[2]+1
if distance_x < 0 and larger_layout[2] > smaller_layout[0]:
larger_layout[2] = smaller_layout[0]-1
else: # 上下重叠
if larger_layout["layout_bbox"][1] < smaller_layout["layout_bbox"][3]:
larger_layout["layout_bbox"][1] = smaller_layout["layout_bbox"][3]
else:
larger_layout["layout_bbox"][3] = smaller_layout["layout_bbox"][1]
# todo 排序调整布局边界框列表
if distance_y > 0 and larger_layout[1] < smaller_layout[3]:
larger_layout[1] = smaller_layout[3]+1
if distance_y < 0 and larger_layout[3] > smaller_layout[1]:
larger_layout[3] = smaller_layout[1]-1
# 排序调整布局边界框列表
new_bboxes = []
for layout_bbox in layout_bboxes:
new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None,None])
layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id)
# 返回排序调整后的布局边界框列表
return layout_bboxes
......@@ -79,6 +83,7 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
"""
page_id = ocr_page_info['page_info']['page_no']-1
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
# 初始化布局边界框列表
layout_bboxes = []
......@@ -88,12 +93,9 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio),
int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)]
# 创建子布局的边界框字典
layout_bbox = {
"layout_bbox": bbox,
}
# 将子布局的边界框添加到列表中
layout_bboxes.append(layout_bbox)
layout_bboxes.append(bbox)
# 初始化新的布局边界框列表
new_layout_bboxes = []
......@@ -102,14 +104,14 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
# 初始化标记变量,用于判断当前边界框是否需要保留
keep = True
# 获取当前边界框的坐标信息
box_i = layout_bboxes[i]["layout_bbox"]
box_i = layout_bboxes[i]
# 遍历其他边界框
for j in range(len(layout_bboxes)):
# 排除当前边界框自身
if i != j:
# 获取其他边界框的坐标信息
box_j = layout_bboxes[j]["layout_bbox"]
box_j = layout_bboxes[j]
# 检测box_i是否被box_j包含
if _is_in(box_i, box_j):
# 如果当前边界框被其他边界框包含,则标记为不需要保留
......@@ -122,7 +124,10 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
new_layout_bboxes.append(layout_bboxes[i])
# 对新的布局边界框列表进行排序调整
layout_bboxes = adjust_layouts(new_layout_bboxes)
page_width = page.rect.width
page_height = page.rect.height
page_boundry = [0, 0, page_width, page_height]
layout_bboxes = adjust_layouts(new_layout_bboxes, page_boundry, page_id)
# 返回排序调整后的布局边界框列表
return layout_bboxes
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment