Commit 94a7ba3d authored by liukaiwen

lkw

parent da509143
from magic_pdf.libs.commons import fitz # PyMuPDF from pathlib import Path
from magic_pdf.libs.commons import fitz, join_path # PyMuPDF
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
import json import json
import os
...@@ -20,7 +22,19 @@ doc = fitz.open(pdf_path) # Open the PDF ...@@ -20,7 +22,19 @@ doc = fitz.open(pdf_path) # Open the PDF
data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]] data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json" ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
ocr_pdf_info = read_json_file(ocr_json_file_path) ocr_pdf_info = read_json_file(ocr_json_file_path)
pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
pth = Path(ocr_json_file_path)
book_name = pth.name
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
save_path = join_path(save_tmp_path, "md")
pdf_info_dict = parse_pdf_by_ocr(
pdf_path,
None,
ocr_pdf_info,
save_path,
book_name,
debug_mode=True)
data_list = [] data_list = []
for page in pdf_info_dict.values(): for page in pdf_info_dict.values():
page_list = [] page_list = []
......
...@@ -4,7 +4,6 @@ import time ...@@ -4,7 +4,6 @@ import time
from loguru import logger from loguru import logger
from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans, modify_y_axis
from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
from magic_pdf.libs.coordinate_transform import get_scale_ratio from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.safe_filename import sanitize_filename from magic_pdf.libs.safe_filename import sanitize_filename
...@@ -14,7 +13,7 @@ from magic_pdf.pre_proc.detect_header import parse_headers ...@@ -14,7 +13,7 @@ from magic_pdf.pre_proc.detect_header import parse_headers
from magic_pdf.pre_proc.detect_page_number import parse_pageNos from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout, modify_y_axis
from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
...@@ -150,7 +149,7 @@ def parse_pdf_by_ocr( ...@@ -150,7 +149,7 @@ def parse_pdf_by_ocr(
spans = remove_overlaps_min_spans(spans) spans = remove_overlaps_min_spans(spans)
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0 # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
# spans = modify_y_axis(spans) spans = modify_y_axis(spans)
# 删除remove_span_block_bboxes中的bbox # 删除remove_span_block_bboxes中的bbox
spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes) spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
......
...@@ -113,15 +113,19 @@ def modify_y_axis(spans: list): ...@@ -113,15 +113,19 @@ def modify_y_axis(spans: list):
#用于给行间公式搜索 #用于给行间公式搜索
text_inline_lines = [] text_inline_lines = []
for span in spans[1:]: for span in spans[1:]:
if span.get("content","") == "78.":
print("debug")
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation" # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# image和table类型,同上 # image和table类型,同上
if span['type'] in ["displayed_equation", "image", "table"] or any( if span['type'] in ["displayed_equation", "image", "table"] or any(
s['type'] in ["displayed_equation", "image", "table"] for s in current_line): s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
#传入 #传入
if spans[0]["type"] in ["displayed_equation", "image", "table"]: if span["type"] in ["displayed_equation", "image", "table"]:
displayed_list.append(span) displayed_list.append(span)
# 则开始新行 # 则开始新行
lines.append(current_line) lines.append(current_line)
if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
current_line = [span] current_line = [span]
line_first_y0 = span["bbox"][1] line_first_y0 = span["bbox"][1]
line_first_y = span["bbox"][3] line_first_y = span["bbox"][3]
...@@ -140,15 +144,14 @@ def modify_y_axis(spans: list): ...@@ -140,15 +144,14 @@ def modify_y_axis(spans: list):
lines.append(current_line) lines.append(current_line)
text_inline_lines.append((current_line, (line_first_y0, line_first_y))) text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
current_line = [span] current_line = [span]
line_first_y0 = spans[0]["bbox"][1] line_first_y0 = span["bbox"][1]
line_first_y = spans[0]["bbox"][3] line_first_y = span["bbox"][3]
# 添加最后一行 # 添加最后一行
if current_line: if current_line:
lines.append(current_line) lines.append(current_line)
if len(current_line)>1 or current_line[0]["type"] in ["text", "inline_equation"]: if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
text_inline_lines.append((current_line, (line_first_y0, line_first_y))) text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
for line in text_inline_lines: for line in text_inline_lines:
# 按照x0坐标排序 # 按照x0坐标排序
current_line = line[0] current_line = line[0]
...@@ -164,14 +167,17 @@ def modify_y_axis(spans: list): ...@@ -164,14 +167,17 @@ def modify_y_axis(spans: list):
#错误行间公式转行内公式 #错误行间公式转行内公式
j = 0 j = 0
for i in range(len(displayed_list)): for i in range(len(displayed_list)):
if i == 8:
print("debug")
span = displayed_list[i] span = displayed_list[i]
span_y0, span_y = span["bbox"][1], span["bbox"][3] span_y0, span_y = span["bbox"][1], span["bbox"][3]
while j < len(text_inline_lines): while j < len(text_inline_lines):
text_line = text_inline_lines[j] text_line = text_inline_lines[j]
y0, y1 = text_line[1] y0, y1 = text_line[1]
if span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1 and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)): if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
span["bbox"][1] = y0 span["bbox"][1] = y0
span["bbox"][3] = y1 # span["bbox"][3] = y1
if span["type"] == "displayed_equation": if span["type"] == "displayed_equation":
span["type"] = "inline_equation" span["type"] = "inline_equation"
break break
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment