Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
92f66849
Unverified
Commit
92f66849
authored
Apr 29, 2024
by
myhloli
Committed by
GitHub
Apr 29, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #88 from icecraft/fix/lost_text_and_uncorrect_inline_equation_pos
Fix/lost text and uncorrect inline equation pos
parents
016f871a
e91d5929
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
16 deletions
+23
-16
equations_replace.py
magic_pdf/pre_proc/equations_replace.py
+23
-16
No files found.
magic_pdf/pre_proc/equations_replace.py
View file @
92f66849
...
@@ -107,10 +107,10 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
...
@@ -107,10 +107,10 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
or
y0_1
>
y1_2
or
y0_1
>
y1_2
)
# box1在box2的下边
)
# box1在box2的下边
def
remove_text_block_overlap_interline_equation_bbox
(
def
remove_text_block_overlap_interline_equation_bbox
(
interline_eq_bboxes
,
pymu_block_list
interline_eq_bboxes
,
pymu_block_list
):
):
"""消除掉行行内公式有部分重叠的文本块的内容。
"""消除掉行行内公式有部分重叠的文本块的内容。
同时重新计算消除重叠之后文本块的大小"""
同时重新计算消除重叠之后文本块的大小"""
deleted_block
=
[]
deleted_block
=
[]
...
@@ -317,12 +317,7 @@ def replace_line_v2(eqinfo, line):
...
@@ -317,12 +317,7 @@ def replace_line_v2(eqinfo, line):
"descender"
:
-
0.3050000071525574
,
"descender"
:
-
0.3050000071525574
,
"latex"
:
""
,
"latex"
:
""
,
"origin"
:
[
337.1410153102337
,
216.0205245153934
],
"origin"
:
[
337.1410153102337
,
216.0205245153934
],
"bbox"
:
[
"bbox"
:
eqinfo
[
"bbox"
]
337.1410153102337
,
216.0205245153934
,
390.4496373892022
,
228.50171037628277
,
],
}
}
# equation_span = line['spans'][0].copy()
# equation_span = line['spans'][0].copy()
equation_span
[
"latex"
]
=
eqinfo
[
'latex'
]
equation_span
[
"latex"
]
=
eqinfo
[
'latex'
]
...
@@ -363,6 +358,11 @@ def replace_line_v2(eqinfo, line):
...
@@ -363,6 +358,11 @@ def replace_line_v2(eqinfo, line):
line
[
"spans"
]
.
remove
(
first_overlap_span
)
line
[
"spans"
]
.
remove
(
first_overlap_span
)
if
len
(
tail_span_chars
)
>
0
:
if
len
(
tail_span_chars
)
>
0
:
min_of_tail_span_x0
=
min
([
chr
[
"bbox"
][
0
]
for
chr
in
tail_span_chars
])
min_of_tail_span_y0
=
min
([
chr
[
"bbox"
][
1
]
for
chr
in
tail_span_chars
])
max_of_tail_span_x1
=
max
([
chr
[
"bbox"
][
2
]
for
chr
in
tail_span_chars
])
max_of_tail_span_y1
=
max
([
chr
[
"bbox"
][
3
]
for
chr
in
tail_span_chars
])
if
last_overlap_span
==
first_overlap_span
:
# 这个时候应该插入一个新的
if
last_overlap_span
==
first_overlap_span
:
# 这个时候应该插入一个新的
tail_span_txt
=
""
.
join
([
char
[
"c"
]
for
char
in
tail_span_chars
])
tail_span_txt
=
""
.
join
([
char
[
"c"
]
for
char
in
tail_span_chars
])
last_span_to_insert
=
last_overlap_span
.
copy
()
last_span_to_insert
=
last_overlap_span
.
copy
()
...
@@ -370,12 +370,20 @@ def replace_line_v2(eqinfo, line):
...
@@ -370,12 +370,20 @@ def replace_line_v2(eqinfo, line):
last_span_to_insert
[
"text"
]
=
""
.
join
(
last_span_to_insert
[
"text"
]
=
""
.
join
(
[
char
[
"c"
]
for
char
in
tail_span_chars
]
[
char
[
"c"
]
for
char
in
tail_span_chars
]
)
)
last_span_to_insert
[
"bbox"
]
=
(
if
equation_span
[
"bbox"
][
2
]
>=
last_overlap_span
[
"bbox"
][
2
]:
min
([
chr
[
"bbox"
][
0
]
for
chr
in
tail_span_chars
]),
last_span_to_insert
[
"bbox"
]
=
(
last_overlap_span
[
"bbox"
][
1
],
min_of_tail_span_x0
,
last_overlap_span
[
"bbox"
][
2
],
min_of_tail_span_y0
,
last_overlap_span
[
"bbox"
][
3
],
max_of_tail_span_x1
,
)
max_of_tail_span_y1
)
else
:
last_span_to_insert
[
"bbox"
]
=
(
min
([
chr
[
"bbox"
][
0
]
for
chr
in
tail_span_chars
]),
last_overlap_span
[
"bbox"
][
1
],
last_overlap_span
[
"bbox"
][
2
],
last_overlap_span
[
"bbox"
][
3
],
)
# 插入到公式对象之后
# 插入到公式对象之后
equation_idx
=
line
[
"spans"
]
.
index
(
equation_span
)
equation_idx
=
line
[
"spans"
]
.
index
(
equation_span
)
line
[
"spans"
]
.
insert
(
equation_idx
+
1
,
last_span_to_insert
)
# 放入公式
line
[
"spans"
]
.
insert
(
equation_idx
+
1
,
last_span_to_insert
)
# 放入公式
...
@@ -460,17 +468,16 @@ def replace_equations_in_textblock(
...
@@ -460,17 +468,16 @@ def replace_equations_in_textblock(
"""
"""
替换行间和和行内公式为latex
替换行间和和行内公式为latex
"""
"""
raw_text_blocks
=
remove_text_block_in_interline_equation_bbox
(
raw_text_blocks
=
remove_text_block_in_interline_equation_bbox
(
interline_equation_bboxes
,
raw_text_blocks
interline_equation_bboxes
,
raw_text_blocks
)
# 消除重叠:第一步,在公式内部的
)
# 消除重叠:第一步,在公式内部的
raw_text_blocks
=
remove_text_block_overlap_interline_equation_bbox
(
raw_text_blocks
=
remove_text_block_overlap_interline_equation_bbox
(
interline_equation_bboxes
,
raw_text_blocks
interline_equation_bboxes
,
raw_text_blocks
)
# 消重,第二步,和公式覆盖的
)
# 消重,第二步,和公式覆盖的
insert_interline_equations_textblock
(
interline_equation_bboxes
,
raw_text_blocks
)
insert_interline_equations_textblock
(
interline_equation_bboxes
,
raw_text_blocks
)
raw_text_blocks
=
replace_inline_equations
(
inline_equation_bboxes
,
raw_text_blocks
)
raw_text_blocks
=
replace_inline_equations
(
inline_equation_bboxes
,
raw_text_blocks
)
return
raw_text_blocks
return
raw_text_blocks
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment