Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
caa1588a
Commit
caa1588a
authored
Mar 07, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocr拼接逻辑更新
parent
a0be4652
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
55 additions
and
25 deletions
+55
-25
ocr_dict_merge.py
magic_pdf/libs/ocr_dict_merge.py
+17
-3
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+38
-22
No files found.
magic_pdf/libs/ocr_dict_merge.py
View file @
caa1588a
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
,
get_minbox_if_overlap_by_ratio
def
merge_spans
(
spans
):
# 删除重叠spans中较小的那些
def remove_overlaps_min_spans(spans):
    """Remove the smaller span of each heavily-overlapping pair, in place.

    Every span from a snapshot of the input is paired with every span still
    present in ``spans``. Each pair of unequal spans is passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8; whenever that
    helper returns a bbox, the first span in ``spans`` whose ``'bbox'``
    equals it is removed.

    Args:
        spans: list of span dicts, each with a ``'bbox'`` entry. The list
            is mutated.

    Returns:
        The same ``spans`` list object, after the removals.
    """
    # The outer loop walks a fixed snapshot, so removals never disturb it.
    outer_snapshot = list(spans)
    for candidate in outer_snapshot:
        # The inner snapshot is rebuilt per candidate, so it reflects the
        # removals made so far.
        for other in list(spans):
            if candidate == other:
                continue

            # Presumably the bbox of the smaller span when the overlap
            # exceeds the ratio, otherwise None - confirm against boxbase.
            smaller_bbox = get_minbox_if_overlap_by_ratio(
                candidate['bbox'], other['bbox'], 0.8
            )
            if smaller_bbox is None:
                continue

            # Remove the first remaining span that carries this bbox, if any.
            for position, existing in enumerate(spans):
                if existing['bbox'] == smaller_bbox:
                    del spans[position]
                    break
    return spans
def
merge_spans_to_line
(
spans
):
# 按照y0坐标排序
# 按照y0坐标排序
spans
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
1
])
spans
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
1
])
...
@@ -9,7 +22,8 @@ def merge_spans(spans):
...
@@ -9,7 +22,8 @@ def merge_spans(spans):
current_line
=
[
spans
[
0
]]
current_line
=
[
spans
[
0
]]
for
span
in
spans
[
1
:]:
for
span
in
spans
[
1
:]:
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
if
span
[
'type'
]
==
"displayed_equation"
or
any
(
s
[
'type'
]
==
"displayed_equation"
for
s
in
current_line
):
# image和table类型,同上
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
s
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
for
s
in
current_line
):
# 则开始新行
# 则开始新行
lines
.
append
(
current_line
)
lines
.
append
(
current_line
)
current_line
=
[
span
]
current_line
=
[
span
]
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
caa1588a
from
magic_pdf.libs.boxbase
import
get_minbox_if_overlap_by_ratio
from
loguru
import
logger
from
magic_pdf.libs.ocr_dict_merge
import
merge_spans
from
magic_pdf.libs.ocr_dict_merge
import
merge_spans_to_line
,
remove_overlaps_min_spans
def
construct_page_component
(
page_id
,
text_blocks_preproc
):
def
construct_page_component
(
page_id
,
blocks
):
return_dict
=
{
return_dict
=
{
'preproc_blocks'
:
text_blocks_preproc
,
'preproc_blocks'
:
blocks
,
'page_idx'
:
page_id
'page_idx'
:
page_id
,
}
}
return
return_dict
return
return_dict
...
@@ -24,17 +25,32 @@ def parse_pdf_by_ocr(
...
@@ -24,17 +25,32 @@ def parse_pdf_by_ocr(
spans
=
[]
spans
=
[]
for
layout_det
in
layout_dets
:
for
layout_det
in
layout_dets
:
category_id
=
layout_det
[
'category_id'
]
category_id
=
layout_det
[
'category_id'
]
allow_category_id_list
=
[
13
,
14
,
15
]
allow_category_id_list
=
[
1
,
7
,
1
3
,
14
,
15
]
if
category_id
in
allow_category_id_list
:
if
category_id
in
allow_category_id_list
:
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
layout_det
[
'poly'
]
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
layout_det
[
'poly'
]
bbox
=
[
int
(
x0
),
int
(
y0
),
int
(
x1
),
int
(
y1
)]
bbox
=
[
int
(
x0
),
int
(
y0
),
int
(
x1
),
int
(
y1
)]
# 13: 'embedding', # 嵌入公式
'''要删除的'''
# 14: 'isolated', # 单行公式
# 3: 'header', # 页眉
# 15: 'ocr_text', # ocr识别文本
# 4: 'page number', # 页码
# 5: 'footnote', # 脚注
# 6: 'footer', # 页脚
'''当成span拼接的'''
# 1: 'image', # 图片
# 7: 'table', # 表格
# 13: 'inline_equation', # 行内公式
# 14: 'displayed_equation', # 行间公式
# 15: 'text', # ocr识别文本
'''layout信息'''
# 11: 'full column', # 单栏
# 12: 'sub column', # 多栏
span
=
{
span
=
{
'bbox'
:
bbox
,
'bbox'
:
bbox
,
}
}
if
category_id
==
13
:
if
category_id
==
1
:
span
[
'type'
]
=
'image'
elif
category_id
==
7
:
span
[
'type'
]
=
'table'
elif
category_id
==
13
:
span
[
'content'
]
=
layout_det
[
'latex'
]
span
[
'content'
]
=
layout_det
[
'latex'
]
span
[
'type'
]
=
'inline_equation'
span
[
'type'
]
=
'inline_equation'
elif
category_id
==
14
:
elif
category_id
==
14
:
...
@@ -48,18 +64,18 @@ def parse_pdf_by_ocr(
...
@@ -48,18 +64,18 @@ def parse_pdf_by_ocr(
else
:
else
:
continue
continue
#
合并重叠的spans
#
删除重叠spans中较小的那些
for
span1
in
spans
.
copy
():
spans
=
remove_overlaps_min_spans
(
spans
)
for
span2
in
spans
.
copy
():
if
span1
!=
span2
:
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
overlap_box
=
get_minbox_if_overlap_by_ratio
(
span1
[
'bbox'
],
span2
[
'bbox'
],
0.8
)
if
overlap_box
is
not
None
:
bbox_to_remove
=
next
((
span
for
span
in
spans
if
span
[
'bbox'
]
==
overlap_box
),
None
)
# 将spans合并成line(从上到下,从左到右
)
if
bbox_to_remove
is
not
None
:
lines
=
merge_spans_to_line
(
spans
)
spans
.
remove
(
bbox_to_remove
)
# logger.info(lines
)
#
将spans合并成line
#
从ocr_page_info中获取layout信息
lines
=
merge_spans
(
spans
)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks
=
[]
blocks
=
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment