Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
fcea39d3
Commit
fcea39d3
authored
Mar 07, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加ocr模式的layout解析功能
parent
00f3e329
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
140 additions
and
15 deletions
+140
-15
ocr_demo.py
demo/ocr_demo.py
+9
-7
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+8
-8
ocr_detect_layout.py
magic_pdf/pre_proc/ocr_detect_layout.py
+123
-0
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+0
-0
No files found.
demo/ocr_demo.py
View file @
fcea39d3
...
...
@@ -28,10 +28,12 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_0.json"
ocr_pdf_info
=
read_json_file
(
ocr_json_file_path
)
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_info
)
markdown_text
=
mk_nlp_markdown
(
pdf_info_dict
)
logger
.
info
(
markdown_text
)
save_markdown
(
markdown_text
,
ocr_json_file_path
)
# Path of the OCR result JSON used for this demo run.
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_1(3).json"
try:
    # Load the OCR output, parse it into the per-page dict, render markdown,
    # log it and save it using the JSON path as reference.
    ocr_pdf_info = read_json_file(ocr_json_file_path)
    pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
    markdown_text = mk_nlp_markdown(pdf_info_dict)
    logger.info(markdown_text)
    save_markdown(markdown_text, ocr_json_file_path)
except Exception as e:
    # Demo boundary: keep the script from crashing, but log the full traceback
    # (logger.error(e) would only record the exception message).
    logger.exception(e)
magic_pdf/pdf_parse_by_ocr.py
View file @
fcea39d3
from
loguru
import
logger
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
,
remove_overlaps_min_spans
from
magic_pdf.libs.ocr_dict_merge
import
merge_spans_to_line
,
remove_overlaps_min_spans
def
construct_page_component
(
page_id
,
blocks
):
def construct_page_component(page_id, blocks, layout_bboxes):
    """
    Bundle the parsed pieces of one page into a single dict.

    Args:
        page_id: Stored under the 'page_idx' key.
        blocks: Stored under the 'preproc_blocks' key.
        layout_bboxes: Stored under the 'layout_bboxes' key.

    Returns:
        dict: A new dict referencing the three arguments (no copies are made).
    """
    return {
        'preproc_blocks': blocks,
        'page_idx': page_id,
        'layout_bboxes': layout_bboxes,
    }
...
...
@@ -74,9 +74,6 @@ def parse_pdf_by_ocr(
lines
=
merge_spans_to_line
(
spans
)
# logger.info(lines)
# 从ocr_page_info中获取layout信息
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks
=
[]
for
line
in
lines
:
...
...
@@ -85,8 +82,11 @@ def parse_pdf_by_ocr(
"lines"
:
[
line
],
})
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
])
# 构造pdf_info_dict
page_info
=
construct_page_component
(
page_id
,
blocks
)
page_info
=
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
return
pdf_info_dict
...
...
magic_pdf/pre_proc/ocr_detect_layout.py
0 → 100644
View file @
fcea39d3
from
magic_pdf.libs.boxbase
import
_is_part_overlap
,
_is_in
def get_center_point(bbox):
    """
    Compute the centre point of a bounding box.

    Args:
        bbox (list): Bounding box coordinates, four elements in the order
            top-left x, top-left y, bottom-right x, bottom-right y.

    Returns:
        list: Two elements, the x and y coordinates of the centre.
    """
    center_x = (bbox[0] + bbox[2]) / 2
    center_y = (bbox[1] + bbox[3]) / 2
    return [center_x, center_y]
def get_area(bbox):
    """
    Compute the area of a bounding box.

    Args:
        bbox (list): Bounding box coordinates, four elements in the order
            top-left x, top-left y, bottom-right x, bottom-right y.

    Returns:
        The product of the box width (x1 - x0) and height (y1 - y0). No
        clamping is applied, so an inverted box yields a negative or
        sign-flipped value.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
def adjust_layouts(layout_bboxes):
    """
    Resolve partial overlaps between layout boxes by trimming the larger box.

    Every unordered pair of entries is tested with ``_is_part_overlap``. For an
    overlapping pair, the box with the larger area (the second one on a tie)
    has one edge moved to the facing edge of the smaller box:

    * if the centres are further apart horizontally than vertically, the
      larger box's left or right edge is moved;
    * otherwise its top or bottom edge is moved.

    Which of the two edges moves is decided by which side of the larger box's
    centre the smaller box's centre lies on.

    Args:
        layout_bboxes (list): Dicts each holding a "layout_bbox" entry of the
            form [x0, y0, x1, y1]. The coordinate lists are modified in place.

    Returns:
        list: The same list object that was passed in. Entries are neither
        reordered nor removed; only coordinates may change.
    """
    for i, layout_i in enumerate(layout_bboxes):
        # Only look at boxes after the current one so each pair is seen once.
        for layout_j in layout_bboxes[i + 1:]:
            box_i = layout_i["layout_bbox"]
            box_j = layout_j["layout_bbox"]

            # Pass the coordinate lists, not the wrapper dicts.
            # NOTE(review): assumes _is_part_overlap takes two [x0, y0, x1, y1]
            # boxes, matching how _is_in is called in layout_detect -- confirm.
            if not _is_part_overlap(box_i, box_j):
                continue

            center_i = get_center_point(box_i)
            center_j = get_center_point(box_j)

            # Horizontal / vertical distance between the two centres.
            dx = abs(center_i[0] - center_j[0])
            dy = abs(center_i[1] - center_j[1])

            # The larger box is the one that gets trimmed.
            if get_area(box_i) > get_area(box_j):
                larger_box, larger_center = box_i, center_i
                smaller_box, smaller_center = box_j, center_j
            else:
                larger_box, larger_center = box_j, center_j
                smaller_box, smaller_center = box_i, center_i

            if dx > dy:
                # Side-by-side overlap. Comparing edges (larger x0 < smaller x1)
                # is true for any intersecting pair, so the side is chosen from
                # the centres instead.
                if smaller_center[0] < larger_center[0]:
                    # Smaller box is on the left: pull in the left edge.
                    larger_box[0] = smaller_box[2]
                else:
                    # Smaller box is on the right: pull in the right edge.
                    larger_box[2] = smaller_box[0]
            else:
                # Stacked overlap, handled the same way on the y axis.
                if smaller_center[1] < larger_center[1]:
                    # Smaller box is above: pull in the top edge.
                    larger_box[1] = smaller_box[3]
                else:
                    # Smaller box is below: pull in the bottom edge.
                    larger_box[3] = smaller_box[1]

    return layout_bboxes
def layout_detect(layout_info):
    """
    Build layout boxes from sub-layout records, drop contained ones and
    hand the rest to ``adjust_layouts``.

    Args:
        layout_info (list): Sub-layout dicts, each with a 'poly' entry of
            exactly eight values. Values 0-1 and 4-5 are used as
            (x0, y0) and (x1, y1); the other four are ignored.
            NOTE(review): presumably 'poly' lists the four corners clockwise
            from the top-left -- confirm against the OCR output format.

    Returns:
        list: The result of ``adjust_layouts`` applied to the kept boxes;
        each element is a dict with a "layout_bbox" entry [x0, y0, x1, y1].
    """
    # Wrap the two used corners of every sub-layout in a {"layout_bbox": ...} dict.
    candidates = []
    for sub_layout in layout_info:
        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
        candidates.append({"layout_bbox": [x0, y0, x1, y1]})

    def _contained_in_other(index):
        """Return True if box `index` is reported by _is_in as inside any other box."""
        box = candidates[index]["layout_bbox"]
        return any(
            _is_in(box, other["layout_bbox"])
            for position, other in enumerate(candidates)
            if position != index
        )

    # Keep only the boxes that are not contained in another box.
    survivors = [
        entry
        for index, entry in enumerate(candidates)
        if not _contained_in_other(index)
    ]

    # Let adjust_layouts deal with remaining partial overlaps.
    return adjust_layouts(survivors)
magic_pdf/
libs
/ocr_dict_merge.py
→
magic_pdf/
pre_proc
/ocr_dict_merge.py
View file @
fcea39d3
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment