Commit 83deab21 authored by liukaiwen

lkw

parent c38c784e
from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
import json
def read_json_file(file_path):
    """Load a JSON document from disk and return the parsed object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Pin UTF-8 explicitly: without it open() falls back to the locale
    # encoding (a legacy code page on many Windows setups), which breaks or
    # garbles JSON that contains non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# PDF文件路径
pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
pdf_path = "D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_org.pdf"
doc = fitz.open(pdf_path) # Open the PDF
# 你的数据
data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
ocr_pdf_info = read_json_file(ocr_json_file_path)
pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
data_list = []
for page in pdf_info_dict.values():
page_list = []
blocks = page.get("preproc_blocks")
for block in blocks:
lines = block.get("lines")
for line in lines:
spans = line.get("spans")
for span in spans:
page_list.append(span["bbox"])
data_list.append(page_list)
# 对每个页面进行处理
for i, page in enumerate(doc):
# 获取当前页面的数据
page_data = data[i]
page_data = data_list[i]
for img in page_data:
x0, y0, x1, y1, _ = img
x0, y0, x1, y1 = img
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True) # Draw the rectangle
# Save the PDF
doc.save("D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018_new.pdf")
\ No newline at end of file
doc.save("D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_new.pdf")
\ No newline at end of file
......@@ -74,6 +74,7 @@ def modify_y_axis(spans: list):
current_line = [spans[0]]
if spans[0]["type"] in ["displayed_equation", "image", "table"]:
displayed_list.append(spans[0])
line_first_y0 = spans[0]["bbox"][1]
line_first_y = spans[0]["bbox"][3]
#用于给行间公式搜索
......@@ -89,15 +90,16 @@ def modify_y_axis(spans: list):
# 则开始新行
lines.append(current_line)
current_line = [span]
line_first_y0 = spans[0]["bbox"][1]
line_first_y = spans[0]["bbox"][3]
line_first_y0 = span["bbox"][1]
line_first_y = span["bbox"][3]
continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
span["bbox"][1] = line_first_y0
span["bbox"][3] = line_first_y
if span["bbox"][1] < line_first_y0:
line_first_y0 = span["bbox"][1]
if span["bbox"][3] > line_first_y:
line_first_y = span["bbox"][3]
current_line.append(span)
else:
......@@ -111,18 +113,41 @@ def modify_y_axis(spans: list):
# 添加最后一行
if current_line:
lines.append(current_line)
if len(current_line)>1 or current_line[0]["type"] in ["text", "inline_equation"]:
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
for line in text_inline_lines:
# 按照x0坐标排序
line.sort(key=lambda span: span[0]['bbox'][0])
current_line = line[0]
current_line.sort(key=lambda span: span['bbox'][0])
#调整每一个文字行内bbox统一
for line in text_inline_lines:
current_line, (line_first_y0, line_first_y) = line
for span in current_line:
span["bbox"][1] = line_first_y0
span["bbox"][3] = line_first_y
#错误行间公式转行内公式
j = 0
for i in range(len(displayed_list)):
span = displayed_list[i]
span_y0, span_y = span["bbox"][1], span["bbox"][3]
while j < len(text_inline_lines):
text_line = text_inline_lines[j]
y0, y1 = text_line[1]
if span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1 and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
span["bbox"][1] = y0
span["bbox"][3] = y1
if span["type"] == "displayed_equation":
span["type"] = "inline_equation"
break
elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
break
else:
j += 1
return spans
......
from loguru import logger
from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans, modify_y_axis
def construct_page_component(page_id, blocks):
......@@ -68,7 +68,7 @@ def parse_pdf_by_ocr(
spans = remove_overlaps_min_spans(spans)
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
#spans = modify_y_axis(spans)
# 将spans合并成line(从上到下,从左到右)
lines = merge_spans_to_line(spans)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment