Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
016f871a
Unverified
Commit
016f871a
authored
Apr 29, 2024
by
myhloli
Committed by
GitHub
Apr 29, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #86 from myhloli/master
fix equation replace type
parents
c8ccc390
232964d0
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
32 additions
and
18 deletions
+32
-18
pdf_parse_by_txt_v2.py
magic_pdf/pdf_parse_by_txt_v2.py
+26
-12
equations_replace.py
magic_pdf/pre_proc/equations_replace.py
+6
-6
No files found.
magic_pdf/pdf_parse_by_txt_v2.py
View file @
016f871a
...
...
@@ -21,11 +21,7 @@ from magic_pdf.pre_proc.ocr_span_list_modify import (
remove_overlaps_min_spans
,
get_qa_need_list_v2
,
)
from
magic_pdf.pre_proc.equations_replace
import
(
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
)
from
magic_pdf.pre_proc.equations_replace
import
(
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
...
...
@@ -55,6 +51,23 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
bbox
=
span
[
"bbox"
]
if
float_equal
(
bbox
[
0
],
bbox
[
2
])
or
float_equal
(
bbox
[
1
],
bbox
[
3
]):
continue
if
span
.
get
(
'type'
)
==
ContentType
.
InlineEquation
:
spans
.
append
(
{
"bbox"
:
list
(
span
[
"bbox"
]),
"content"
:
span
[
"latex"
],
"type"
:
ContentType
.
InlineEquation
,
}
)
elif
span
.
get
(
'type'
)
==
ContentType
.
InterlineEquation
:
spans
.
append
(
{
"bbox"
:
list
(
span
[
"bbox"
]),
"content"
:
span
[
"latex"
],
"type"
:
ContentType
.
InterlineEquation
,
}
)
else
:
spans
.
append
(
{
"bbox"
:
list
(
span
[
"bbox"
]),
...
...
@@ -65,6 +78,7 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
return
spans
def
replace_text_span
(
pymu_spans
,
ocr_spans
):
return
list
(
filter
(
lambda
x
:
x
[
"type"
]
!=
ContentType
.
Text
,
ocr_spans
))
+
pymu_spans
...
...
magic_pdf/pre_proc/equations_replace.py
View file @
016f871a
...
...
@@ -191,13 +191,13 @@ def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
"spans"
:
[
{
"size"
:
9.962599754333496
,
"
_
type"
:
TYPE_INTERLINE_EQUATION
,
"type"
:
TYPE_INTERLINE_EQUATION
,
"flags"
:
4
,
"font"
:
TYPE_INTERLINE_EQUATION
,
"color"
:
0
,
"ascender"
:
0.9409999847412109
,
"descender"
:
-
0.3050000071525574
,
"
text"
:
f
"
\n
$$
\n
{latex_content}
\n
$$
\n
"
,
"
latex"
:
latex_content
,
"origin"
:
[
bbox
[
0
],
bbox
[
1
]],
"bbox"
:
bbox
,
}
...
...
@@ -309,13 +309,13 @@ def replace_line_v2(eqinfo, line):
equation_span
=
{
"size"
:
9.962599754333496
,
"
_
type"
:
TYPE_INLINE_EQUATION
,
"type"
:
TYPE_INLINE_EQUATION
,
"flags"
:
4
,
"font"
:
TYPE_INLINE_EQUATION
,
"color"
:
0
,
"ascender"
:
0.9409999847412109
,
"descender"
:
-
0.3050000071525574
,
"
text
"
:
""
,
"
latex
"
:
""
,
"origin"
:
[
337.1410153102337
,
216.0205245153934
],
"bbox"
:
[
337.1410153102337
,
...
...
@@ -325,11 +325,11 @@ def replace_line_v2(eqinfo, line):
],
}
# equation_span = line['spans'][0].copy()
equation_span
[
"
text"
]
=
f
" ${eqinfo['latex']}$ "
equation_span
[
"
latex"
]
=
eqinfo
[
'latex'
]
equation_span
[
"bbox"
]
=
[
x0
,
equation_span
[
"bbox"
][
1
],
x1
,
equation_span
[
"bbox"
][
3
]]
equation_span
[
"origin"
]
=
[
equation_span
[
"bbox"
][
0
],
equation_span
[
"bbox"
][
1
]]
equation_span
[
"chars"
]
=
delete_chars
equation_span
[
"
_
type"
]
=
TYPE_INLINE_EQUATION
equation_span
[
"type"
]
=
TYPE_INLINE_EQUATION
equation_span
[
"_eq_bbox"
]
=
eqinfo
[
"bbox"
]
line
[
"spans"
]
.
insert
(
first_overlap_span_idx
+
1
,
equation_span
)
# 放入公式
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment