Commit efca3cab authored by 许瑞

fix: lost spans and incorrect inline equation positions

parent 016f871a
...@@ -107,10 +107,10 @@ def _is_in_or_part_overlap(box1, box2) -> bool: ...@@ -107,10 +107,10 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
or y0_1 > y1_2 or y0_1 > y1_2
) # box1在box2的下边 ) # box1在box2的下边
def remove_text_block_overlap_interline_equation_bbox( def remove_text_block_overlap_interline_equation_bbox(
interline_eq_bboxes, pymu_block_list interline_eq_bboxes, pymu_block_list
): ):
"""消除掉行行内公式有部分重叠的文本块的内容。 """消除掉行行内公式有部分重叠的文本块的内容。
同时重新计算消除重叠之后文本块的大小""" 同时重新计算消除重叠之后文本块的大小"""
deleted_block = [] deleted_block = []
...@@ -191,13 +191,13 @@ def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list): ...@@ -191,13 +191,13 @@ def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
"spans": [ "spans": [
{ {
"size": 9.962599754333496, "size": 9.962599754333496,
"type": TYPE_INTERLINE_EQUATION, "_type": TYPE_INTERLINE_EQUATION,
"flags": 4, "flags": 4,
"font": TYPE_INTERLINE_EQUATION, "font": TYPE_INTERLINE_EQUATION,
"color": 0, "color": 0,
"ascender": 0.9409999847412109, "ascender": 0.9409999847412109,
"descender": -0.3050000071525574, "descender": -0.3050000071525574,
"latex": latex_content, "text": f"\n$$\n{latex_content}\n$$\n",
"origin": [bbox[0], bbox[1]], "origin": [bbox[0], bbox[1]],
"bbox": bbox, "bbox": bbox,
} }
...@@ -309,27 +309,22 @@ def replace_line_v2(eqinfo, line): ...@@ -309,27 +309,22 @@ def replace_line_v2(eqinfo, line):
equation_span = { equation_span = {
"size": 9.962599754333496, "size": 9.962599754333496,
"type": TYPE_INLINE_EQUATION, "_type": TYPE_INLINE_EQUATION,
"flags": 4, "flags": 4,
"font": TYPE_INLINE_EQUATION, "font": TYPE_INLINE_EQUATION,
"color": 0, "color": 0,
"ascender": 0.9409999847412109, "ascender": 0.9409999847412109,
"descender": -0.3050000071525574, "descender": -0.3050000071525574,
"latex": "", "text": "",
"origin": [337.1410153102337, 216.0205245153934], "origin": [337.1410153102337, 216.0205245153934],
"bbox": [ "bbox": eqinfo["bbox"]
337.1410153102337,
216.0205245153934,
390.4496373892022,
228.50171037628277,
],
} }
# equation_span = line['spans'][0].copy() # equation_span = line['spans'][0].copy()
equation_span["latex"] = eqinfo['latex'] equation_span["text"] = f" ${eqinfo['latex']}$ "
equation_span["bbox"] = [x0, equation_span["bbox"][1], x1, equation_span["bbox"][3]] equation_span["bbox"] = [x0, equation_span["bbox"][1], x1, equation_span["bbox"][3]]
equation_span["origin"] = [equation_span["bbox"][0], equation_span["bbox"][1]] equation_span["origin"] = [equation_span["bbox"][0], equation_span["bbox"][1]]
equation_span["chars"] = delete_chars equation_span["chars"] = delete_chars
equation_span["type"] = TYPE_INLINE_EQUATION equation_span["_type"] = TYPE_INLINE_EQUATION
equation_span["_eq_bbox"] = eqinfo["bbox"] equation_span["_eq_bbox"] = eqinfo["bbox"]
line["spans"].insert(first_overlap_span_idx + 1, equation_span) # 放入公式 line["spans"].insert(first_overlap_span_idx + 1, equation_span) # 放入公式
...@@ -363,6 +358,11 @@ def replace_line_v2(eqinfo, line): ...@@ -363,6 +358,11 @@ def replace_line_v2(eqinfo, line):
line["spans"].remove(first_overlap_span) line["spans"].remove(first_overlap_span)
if len(tail_span_chars) > 0: if len(tail_span_chars) > 0:
min_of_tail_span_x0 = min([chr["bbox"][0] for chr in tail_span_chars])
min_of_tail_span_y0 = min([chr["bbox"][1] for chr in tail_span_chars])
max_of_tail_span_x1 = max([chr["bbox"][2] for chr in tail_span_chars])
max_of_tail_span_y1 = max([chr["bbox"][3] for chr in tail_span_chars])
if last_overlap_span == first_overlap_span: # 这个时候应该插入一个新的 if last_overlap_span == first_overlap_span: # 这个时候应该插入一个新的
tail_span_txt = "".join([char["c"] for char in tail_span_chars]) tail_span_txt = "".join([char["c"] for char in tail_span_chars])
last_span_to_insert = last_overlap_span.copy() last_span_to_insert = last_overlap_span.copy()
...@@ -370,6 +370,14 @@ def replace_line_v2(eqinfo, line): ...@@ -370,6 +370,14 @@ def replace_line_v2(eqinfo, line):
last_span_to_insert["text"] = "".join( last_span_to_insert["text"] = "".join(
[char["c"] for char in tail_span_chars] [char["c"] for char in tail_span_chars]
) )
if equation_span["bbox"][2] >= last_overlap_span["bbox"][2]:
last_span_to_insert["bbox"] = (
min_of_tail_span_x0,
min_of_tail_span_y0,
max_of_tail_span_x1,
max_of_tail_span_y1
)
else:
last_span_to_insert["bbox"] = ( last_span_to_insert["bbox"] = (
min([chr["bbox"][0] for chr in tail_span_chars]), min([chr["bbox"][0] for chr in tail_span_chars]),
last_overlap_span["bbox"][1], last_overlap_span["bbox"][1],
...@@ -460,17 +468,23 @@ def replace_equations_in_textblock( ...@@ -460,17 +468,23 @@ def replace_equations_in_textblock(
""" """
替换行间和和行内公式为latex 替换行间和和行内公式为latex
""" """
# debug
from magic_pdf.debug_utils import flatten_spans
raw_text_blocks = remove_text_block_in_interline_equation_bbox( raw_text_blocks = remove_text_block_in_interline_equation_bbox(
interline_equation_bboxes, raw_text_blocks interline_equation_bboxes, raw_text_blocks
) # 消除重叠:第一步,在公式内部的 ) # 消除重叠:第一步,在公式内部的
flatten_spans(raw_text_blocks)
raw_text_blocks = remove_text_block_overlap_interline_equation_bbox( raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(
interline_equation_bboxes, raw_text_blocks interline_equation_bboxes, raw_text_blocks
) # 消重,第二步,和公式覆盖的 ) # 消重,第二步,和公式覆盖的
insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks) flatten_spans(raw_text_blocks)
insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
flatten_spans(raw_text_blocks)
raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks) raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks)
flatten_spans(raw_text_blocks)
return raw_text_blocks return raw_text_blocks
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment