Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f9bd0040
Commit
f9bd0040
authored
Mar 08, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
对模型的layout坐标转换
parent
f62d1aa7
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
26 additions
and
24 deletions
+26
-24
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+17
-18
ocr_detect_layout.py
magic_pdf/pre_proc/ocr_detect_layout.py
+9
-6
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
f9bd0040
...
@@ -55,7 +55,6 @@ def parse_pdf_by_ocr(
...
@@ -55,7 +55,6 @@ def parse_pdf_by_ocr(
with
open
(
pdf_local_path
+
".pdf"
,
"wb"
)
as
pdf_file
:
with
open
(
pdf_local_path
+
".pdf"
,
"wb"
)
as
pdf_file
:
pdf_file
.
write
(
pdf_bytes
)
pdf_file
.
write
(
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
# 初始化空的pdf_info_dict
# 初始化空的pdf_info_dict
pdf_info_dict
=
{}
pdf_info_dict
=
{}
...
@@ -83,7 +82,8 @@ def parse_pdf_by_ocr(
...
@@ -83,7 +82,8 @@ def parse_pdf_by_ocr(
page_no_bboxes
=
parse_pageNos
(
page_id
,
page
,
ocr_page_info
)
page_no_bboxes
=
parse_pageNos
(
page_id
,
page
,
ocr_page_info
)
header_bboxes
=
parse_headers
(
page_id
,
page
,
ocr_page_info
)
header_bboxes
=
parse_headers
(
page_id
,
page
,
ocr_page_info
)
footer_bboxes
=
parse_footers
(
page_id
,
page
,
ocr_page_info
)
footer_bboxes
=
parse_footers
(
page_id
,
page
,
ocr_page_info
)
footnote_bboxes
=
parse_footnotes_by_model
(
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
debug_mode
=
debug_mode
)
footnote_bboxes
=
parse_footnotes_by_model
(
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
debug_mode
=
debug_mode
)
# 构建需要remove的bbox列表
# 构建需要remove的bbox列表
need_remove_spans_bboxes
=
[]
need_remove_spans_bboxes
=
[]
...
@@ -103,7 +103,8 @@ def parse_pdf_by_ocr(
...
@@ -103,7 +103,8 @@ def parse_pdf_by_ocr(
allow_category_id_list
=
[
1
,
7
,
13
,
14
,
15
]
allow_category_id_list
=
[
1
,
7
,
13
,
14
,
15
]
if
category_id
in
allow_category_id_list
:
if
category_id
in
allow_category_id_list
:
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
layout_det
[
'poly'
]
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
layout_det
[
'poly'
]
bbox
=
[
int
(
x0
/
horizontal_scale_ratio
),
int
(
y0
/
vertical_scale_ratio
),
int
(
x1
/
horizontal_scale_ratio
),
int
(
y1
/
vertical_scale_ratio
)]
bbox
=
[
int
(
x0
/
horizontal_scale_ratio
),
int
(
y0
/
vertical_scale_ratio
),
int
(
x1
/
horizontal_scale_ratio
),
int
(
y1
/
vertical_scale_ratio
)]
'''要删除的'''
'''要删除的'''
# 3: 'header', # 页眉
# 3: 'header', # 页眉
# 4: 'page number', # 页码
# 4: 'page number', # 页码
...
@@ -149,9 +150,11 @@ def parse_pdf_by_ocr(
...
@@ -149,9 +150,11 @@ def parse_pdf_by_ocr(
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
)
# 将spans合并成line(从上到下,从左到右)
# 将spans合并成line(在layout内,从上到下,从左到右)
lines
=
merge_spans_to_line
(
spans
)
lines
=
merge_spans_to_line
(
spans
,
layout_bboxes
)
# logger.info(lines)
# logger.info(lines)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
...
@@ -162,12 +165,8 @@ def parse_pdf_by_ocr(
...
@@ -162,12 +165,8 @@ def parse_pdf_by_ocr(
"lines"
:
[
line
],
"lines"
:
[
line
],
})
})
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
])
# 构造pdf_info_dict
# 构造pdf_info_dict
page_info
=
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
)
page_info
=
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
return
pdf_info_dict
return
pdf_info_dict
magic_pdf/pre_proc/ocr_detect_layout.py
View file @
f9bd0040
import
fitz
from
magic_pdf.libs.boxbase
import
_is_part_overlap
,
_is_in
from
magic_pdf.libs.boxbase
import
_is_part_overlap
,
_is_in
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
def
get_center_point
(
bbox
):
def
get_center_point
(
bbox
):
"""
"""
...
@@ -62,9 +66,7 @@ def adjust_layouts(layout_bboxes):
...
@@ -62,9 +66,7 @@ def adjust_layouts(layout_bboxes):
return
layout_bboxes
return
layout_bboxes
def
layout_detect
(
layout_info
,
page
:
fitz
.
Page
):
def
layout_detect
(
layout_info
):
"""
"""
对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。
对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。
...
@@ -75,15 +77,18 @@ def layout_detect(layout_info):
...
@@ -75,15 +77,18 @@ def layout_detect(layout_info):
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
"""
"""
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
layout_info
,
page
)
# 初始化布局边界框列表
# 初始化布局边界框列表
layout_bboxes
=
[]
layout_bboxes
=
[]
# 遍历每个子布局
# 遍历每个子布局
for
sub_layout
in
layout_info
:
for
sub_layout
in
layout_info
:
# 提取子布局的边界框坐标信息
# 提取子布局的边界框坐标信息
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
sub_layout
[
'poly'
]
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
sub_layout
[
'poly'
]
bbox
=
[
int
(
x0
/
horizontal_scale_ratio
),
int
(
y0
/
vertical_scale_ratio
),
int
(
x1
/
horizontal_scale_ratio
),
int
(
y1
/
vertical_scale_ratio
)]
# 创建子布局的边界框字典
# 创建子布局的边界框字典
layout_bbox
=
{
layout_bbox
=
{
"layout_bbox"
:
[
x0
,
y0
,
x1
,
y1
]
,
"layout_bbox"
:
bbox
,
}
}
# 将子布局的边界框添加到列表中
# 将子布局的边界框添加到列表中
layout_bboxes
.
append
(
layout_bbox
)
layout_bboxes
.
append
(
layout_bbox
)
...
@@ -119,5 +124,3 @@ def layout_detect(layout_info):
...
@@ -119,5 +124,3 @@ def layout_detect(layout_info):
# 返回排序调整后的布局边界框列表
# 返回排序调整后的布局边界框列表
return
layout_bboxes
return
layout_bboxes
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment