Commit ebc2f057 authored by 许瑞

fix: remove_overlap leading zero height case

parent c2d5dd2b
...@@ -2,4 +2,8 @@ def float_gt(a, b): ...@@ -2,4 +2,8 @@ def float_gt(a, b):
if 0.0001 >= abs(a -b): if 0.0001 >= abs(a -b):
return False return False
return a > b return a > b
def float_equal(a, b):
    """Return True when *a* and *b* differ by at most 0.0001.

    Args:
        a: First number to compare.
        b: Second number to compare.

    Returns:
        bool: True if ``abs(a - b) <= 0.0001``, otherwise False.
    """
    # Return the comparison directly instead of branching to True/False.
    return abs(a - b) <= 0.0001
\ No newline at end of file
...@@ -31,7 +31,8 @@ from magic_pdf.pre_proc.equations_replace import ( ...@@ -31,7 +31,8 @@ from magic_pdf.pre_proc.equations_replace import (
replace_equations_in_textblock, replace_equations_in_textblock,
) )
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.libs.math import float_equal
from magic_pdf.para.para_split_v2 import para_split
def txt_spans_extract(pdf_page, inline_equations, interline_equations): def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
...@@ -48,6 +49,9 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations): ...@@ -48,6 +49,9 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
for v in text_blocks: for v in text_blocks:
for line in v["lines"]: for line in v["lines"]:
for span in line["spans"]: for span in line["spans"]:
bbox = span["bbox"]
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
continue
spans.append( spans.append(
{ {
"bbox": list(span["bbox"]), "bbox": list(span["bbox"]),
...@@ -167,7 +171,7 @@ def parse_pdf_by_txt( ...@@ -167,7 +171,7 @@ def parse_pdf_by_txt(
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
"""分段""" """分段"""
pass para_split(pdf_info_dict, debug_mode=debug_mode)
"""dict转list""" """dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict) pdf_info_list = dict_to_list(pdf_info_dict)
......
...@@ -5,7 +5,7 @@ def _remove_overlap_between_bbox(spans): ...@@ -5,7 +5,7 @@ def _remove_overlap_between_bbox(spans):
res = [] res = []
for v in spans: for v in spans:
for i in range(len(res)): for i in range(len(res)):
if _is_in(res[i]["bbox"], v["bbox"]): if _is_in(res[i]["bbox"], v["bbox"]) or _is_in(v["bbox"], res[i]["bbox"]):
continue continue
if _is_in_or_part_overlap(res[i]["bbox"], v["bbox"]): if _is_in_or_part_overlap(res[i]["bbox"], v["bbox"]):
ix0, iy0, ix1, iy1 = res[i]["bbox"] ix0, iy0, ix1, iy1 = res[i]["bbox"]
...@@ -17,21 +17,21 @@ def _remove_overlap_between_bbox(spans): ...@@ -17,21 +17,21 @@ def _remove_overlap_between_bbox(spans):
if diff_y > diff_x: if diff_y > diff_x:
if x1 >= ix1: if x1 >= ix1:
mid = (x0 + ix1) // 2 mid = (x0 + ix1) // 2
ix1 = min(mid, ix1) ix1 = min(mid - 0.25, ix1)
x0 = max(mid + 1, x0) x0 = max(mid + 0.25, x0)
else: else:
mid = (ix0 + x1) // 2 mid = (ix0 + x1) // 2
ix0 = max(mid + 1, ix0) ix0 = max(mid + 0.25, ix0)
x1 = min(mid, x1) x1 = min(mid -0.25, x1)
else: else:
if y1 >= iy1: if y1 >= iy1:
mid = (y0 + iy1) // 2 mid = (y0 + iy1) // 2
y0 = max(mid + 1, y0) y0 = max(mid + 0.25, y0)
iy1 = min(iy1, mid) iy1 = min(iy1, mid-0.25)
else: else:
mid = (iy0 + y1) // 2 mid = (iy0 + y1) // 2
y1 = min(y1, mid) y1 = min(y1, mid-0.25)
iy0 = max(mid + 1, iy0) iy0 = max(mid + 0.25, iy0)
res[i]["bbox"] = [ix0, iy0, ix1, iy1] res[i]["bbox"] = [ix0, iy0, ix1, iy1]
v["bbox"] = [x0, y0, x1, y1] v["bbox"] = [x0, y0, x1, y1]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment