Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
3c8b2545
Commit
3c8b2545
authored
Mar 12, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复了layout相交的分离算法,并修复layout排序有误的问题
parent
9cc53a5e
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
34 additions
and
29 deletions
+34
-29
ocr_detect_layout.py
magic_pdf/pre_proc/ocr_detect_layout.py
+34
-29
No files found.
magic_pdf/pre_proc/ocr_detect_layout.py
View file @
3c8b2545
import
fitz
import
fitz
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.libs.boxbase
import
_is_part_overlap
,
_is_in
from
magic_pdf.libs.boxbase
import
_is_part_overlap
,
_is_in
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
...
@@ -26,23 +27,16 @@ def get_area(bbox):
...
@@ -26,23 +27,16 @@ def get_area(bbox):
return
(
bbox
[
2
]
-
bbox
[
0
])
*
(
bbox
[
3
]
-
bbox
[
1
])
return
(
bbox
[
2
]
-
bbox
[
0
])
*
(
bbox
[
3
]
-
bbox
[
1
])
def
adjust_layouts
(
layout_bboxes
):
def
adjust_layouts
(
layout_bboxes
,
page_boundry
,
page_id
):
# 遍历所有布局框
# 遍历所有布局框
for
i
in
range
(
len
(
layout_bboxes
)):
for
i
in
range
(
len
(
layout_bboxes
)):
# 遍历当前布局框之后的布局框
# 遍历当前布局框之后的布局框
for
j
in
range
(
i
+
1
,
len
(
layout_bboxes
)):
for
j
in
range
(
i
+
1
,
len
(
layout_bboxes
)):
# 判断两个布局框是否重叠
# 判断两个布局框是否重叠
if
_is_part_overlap
(
layout_bboxes
[
i
]
[
"layout_bbox"
],
layout_bboxes
[
j
][
"layout_bbox"
]):
if
_is_part_overlap
(
layout_bboxes
[
i
]
,
layout_bboxes
[
j
]):
# 计算每个布局框的中心点坐标和面积
# 计算每个布局框的中心点坐标和面积
center_i
=
get_center_point
(
layout_bboxes
[
i
][
"layout_bbox"
])
area_i
=
get_area
(
layout_bboxes
[
i
])
area_i
=
get_area
(
layout_bboxes
[
i
][
"layout_bbox"
])
area_j
=
get_area
(
layout_bboxes
[
j
])
center_j
=
get_center_point
(
layout_bboxes
[
j
][
"layout_bbox"
])
area_j
=
get_area
(
layout_bboxes
[
j
][
"layout_bbox"
])
# 计算横向和纵向的距离差
dx
=
abs
(
center_i
[
0
]
-
center_j
[
0
])
dy
=
abs
(
center_i
[
1
]
-
center_j
[
1
])
# 较大布局框和较小布局框的赋值
# 较大布局框和较小布局框的赋值
if
area_i
>
area_j
:
if
area_i
>
area_j
:
...
@@ -50,19 +44,29 @@ def adjust_layouts(layout_bboxes):
...
@@ -50,19 +44,29 @@ def adjust_layouts(layout_bboxes):
else
:
else
:
larger_layout
,
smaller_layout
=
layout_bboxes
[
j
],
layout_bboxes
[
i
]
larger_layout
,
smaller_layout
=
layout_bboxes
[
j
],
layout_bboxes
[
i
]
center_large
=
get_center_point
(
larger_layout
)
center_small
=
get_center_point
(
smaller_layout
)
# 计算横向和纵向的距离差
distance_x
=
center_large
[
0
]
-
center_small
[
0
]
distance_y
=
center_large
[
1
]
-
center_small
[
1
]
# 根据距离差判断重叠方向并修正边界
# 根据距离差判断重叠方向并修正边界
if
dx
>
dy
:
# 左右重叠
if
abs
(
distance_x
)
>
abs
(
distance_y
)
:
# 左右重叠
if
larger_layout
[
"layout_bbox"
][
0
]
<
smaller_layout
[
"layout_bbox"
]
[
2
]:
if
distance_x
>
0
and
larger_layout
[
0
]
<
smaller_layout
[
2
]:
larger_layout
[
"layout_bbox"
][
0
]
=
smaller_layout
[
"layout_bbox"
][
2
]
larger_layout
[
0
]
=
smaller_layout
[
2
]
+
1
else
:
if
distance_x
<
0
and
larger_layout
[
2
]
>
smaller_layout
[
0
]
:
larger_layout
[
"layout_bbox"
][
2
]
=
smaller_layout
[
"layout_bbox"
][
0
]
larger_layout
[
2
]
=
smaller_layout
[
0
]
-
1
else
:
# 上下重叠
else
:
# 上下重叠
if
larger_layout
[
"layout_bbox"
][
1
]
<
smaller_layout
[
"layout_bbox"
][
3
]:
if
distance_y
>
0
and
larger_layout
[
1
]
<
smaller_layout
[
3
]:
larger_layout
[
"layout_bbox"
][
1
]
=
smaller_layout
[
"layout_bbox"
][
3
]
larger_layout
[
1
]
=
smaller_layout
[
3
]
+
1
else
:
if
distance_y
<
0
and
larger_layout
[
3
]
>
smaller_layout
[
1
]:
larger_layout
[
"layout_bbox"
][
3
]
=
smaller_layout
[
"layout_bbox"
][
1
]
larger_layout
[
3
]
=
smaller_layout
[
1
]
-
1
# todo 排序调整布局边界框列表
# 排序调整布局边界框列表
new_bboxes
=
[]
for
layout_bbox
in
layout_bboxes
:
new_bboxes
.
append
([
layout_bbox
[
0
],
layout_bbox
[
1
],
layout_bbox
[
2
],
layout_bbox
[
3
],
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
])
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
new_bboxes
,
page_boundry
,
page_id
)
# 返回排序调整后的布局边界框列表
# 返回排序调整后的布局边界框列表
return
layout_bboxes
return
layout_bboxes
...
@@ -79,6 +83,7 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
...
@@ -79,6 +83,7 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
"""
"""
page_id
=
ocr_page_info
[
'page_info'
][
'page_no'
]
-
1
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
ocr_page_info
,
page
)
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
ocr_page_info
,
page
)
# 初始化布局边界框列表
# 初始化布局边界框列表
layout_bboxes
=
[]
layout_bboxes
=
[]
...
@@ -88,12 +93,9 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
...
@@ -88,12 +93,9 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
sub_layout
[
'poly'
]
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
sub_layout
[
'poly'
]
bbox
=
[
int
(
x0
/
horizontal_scale_ratio
),
int
(
y0
/
vertical_scale_ratio
),
bbox
=
[
int
(
x0
/
horizontal_scale_ratio
),
int
(
y0
/
vertical_scale_ratio
),
int
(
x1
/
horizontal_scale_ratio
),
int
(
y1
/
vertical_scale_ratio
)]
int
(
x1
/
horizontal_scale_ratio
),
int
(
y1
/
vertical_scale_ratio
)]
# 创建子布局的边界框字典
layout_bbox
=
{
"layout_bbox"
:
bbox
,
}
# 将子布局的边界框添加到列表中
# 将子布局的边界框添加到列表中
layout_bboxes
.
append
(
layout_
bbox
)
layout_bboxes
.
append
(
bbox
)
# 初始化新的布局边界框列表
# 初始化新的布局边界框列表
new_layout_bboxes
=
[]
new_layout_bboxes
=
[]
...
@@ -102,14 +104,14 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
...
@@ -102,14 +104,14 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
# 初始化标记变量,用于判断当前边界框是否需要保留
# 初始化标记变量,用于判断当前边界框是否需要保留
keep
=
True
keep
=
True
# 获取当前边界框的坐标信息
# 获取当前边界框的坐标信息
box_i
=
layout_bboxes
[
i
]
[
"layout_bbox"
]
box_i
=
layout_bboxes
[
i
]
# 遍历其他边界框
# 遍历其他边界框
for
j
in
range
(
len
(
layout_bboxes
)):
for
j
in
range
(
len
(
layout_bboxes
)):
# 排除当前边界框自身
# 排除当前边界框自身
if
i
!=
j
:
if
i
!=
j
:
# 获取其他边界框的坐标信息
# 获取其他边界框的坐标信息
box_j
=
layout_bboxes
[
j
]
[
"layout_bbox"
]
box_j
=
layout_bboxes
[
j
]
# 检测box_i是否被box_j包含
# 检测box_i是否被box_j包含
if
_is_in
(
box_i
,
box_j
):
if
_is_in
(
box_i
,
box_j
):
# 如果当前边界框被其他边界框包含,则标记为不需要保留
# 如果当前边界框被其他边界框包含,则标记为不需要保留
...
@@ -122,7 +124,10 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
...
@@ -122,7 +124,10 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
new_layout_bboxes
.
append
(
layout_bboxes
[
i
])
new_layout_bboxes
.
append
(
layout_bboxes
[
i
])
# 对新的布局边界框列表进行排序调整
# 对新的布局边界框列表进行排序调整
layout_bboxes
=
adjust_layouts
(
new_layout_bboxes
)
page_width
=
page
.
rect
.
width
page_height
=
page
.
rect
.
height
page_boundry
=
[
0
,
0
,
page_width
,
page_height
]
layout_bboxes
=
adjust_layouts
(
new_layout_bboxes
,
page_boundry
,
page_id
)
# 返回排序调整后的布局边界框列表
# 返回排序调整后的布局边界框列表
return
layout_bboxes
return
layout_bboxes
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment