Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
7631907f
Commit
7631907f
authored
Apr 25, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix interline_equations block
parent
351a3ce1
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
22 additions
and
23 deletions
+22
-23
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+21
-22
pdf_parse_by_ocr_v2.py
magic_pdf/pdf_parse_by_ocr_v2.py
+1
-1
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
7631907f
...
...
@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
page_markdown
=
[]
for
para_block
in
paras_of_layout
:
para_text
=
''
para_type
=
para_block
.
get
(
'type'
)
para_type
=
para_block
[
'type'
]
if
para_type
==
BlockType
.
Text
:
para_text
=
merge_para_with_text
(
para_block
)
elif
para_type
==
BlockType
.
Title
:
...
...
@@ -106,32 +106,30 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
if
mode
==
'nlp'
:
continue
elif
mode
==
'mm'
:
img_blocks
=
para_block
.
get
(
'blocks'
)
for
img_block
in
img_blocks
:
if
img_block
.
get
(
'type'
)
==
BlockType
.
ImageBody
:
for
line
in
img_block
.
get
(
'lines'
):
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
.
get
(
'type'
)
==
ContentType
.
Image
:
if
span
[
'type'
]
==
ContentType
.
Image
:
para_text
=
f
"
\n
})
\n
"
for
img_block
in
img_blocks
:
if
img_block
.
get
(
'type'
)
==
BlockType
.
ImageCaption
:
para_text
+=
merge_para_with_text
(
img_
block
)
for
block
in
para_block
[
'blocks'
]
:
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_text
+=
merge_para_with_text
(
block
)
elif
para_type
==
BlockType
.
Table
:
if
mode
==
'nlp'
:
continue
elif
mode
==
'mm'
:
table_blocks
=
para_block
.
get
(
'blocks'
)
for
table_block
in
table_blocks
:
if
table_block
.
get
(
'type'
)
==
BlockType
.
TableBody
:
for
line
in
table_block
.
get
(
'lines'
):
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
.
get
(
'type'
)
==
ContentType
.
Table
:
if
span
[
'type'
]
==
ContentType
.
Table
:
para_text
=
f
"
\n
})
\n
"
for
table_block
in
table_blocks
:
if
table_block
.
get
(
'type'
)
==
BlockType
.
TableCaption
:
para_text
+=
merge_para_with_text
(
table_
block
)
elif
table_block
.
get
(
'type'
)
==
BlockType
.
TableFootnote
:
para_text
+=
merge_para_with_text
(
table_
block
)
for
block
in
para_block
[
'blocks'
]
:
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
para_text
+=
merge_para_with_text
(
block
)
elif
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_text
+=
merge_para_with_text
(
block
)
if
para_text
.
strip
()
==
''
:
continue
...
...
@@ -141,11 +139,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
return
page_markdown
def
merge_para_with_text
(
para
):
def
merge_para_with_text
(
para
_block
):
para_text
=
''
for
line
in
para
[
'lines'
]:
for
line
in
para
_block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
span_type
=
span
.
get
(
'type'
)
span_type
=
span
[
'type'
]
content
=
''
language
=
''
if
span_type
==
ContentType
.
Text
:
...
...
@@ -159,6 +157,7 @@ def merge_para_with_text(para):
content
=
f
"${span['content']}$"
elif
span_type
==
ContentType
.
InterlineEquation
:
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
if
content
!=
''
:
if
language
==
'en'
:
# 英文语境下 content间需要空格分隔
para_text
+=
content
+
' '
...
...
magic_pdf/pdf_parse_by_ocr_v2.py
View file @
7631907f
...
...
@@ -61,7 +61,7 @@ def parse_pdf_by_ocr(pdf_bytes,
'''将所有区块的bbox整理到一起'''
all_bboxes
=
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation
_block
s
,
page_w
,
page_h
)
interline_equations
,
page_w
,
page_h
)
'''根据区块信息计算layout'''
page_boundry
=
[
0
,
0
,
page_w
,
page_h
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment