Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
59d817b1
Unverified
Commit
59d817b1
authored
Mar 12, 2024
by
myhloli
Committed by
GitHub
Mar 12, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #2 from icecraft/feat/proc_bbox
Feat/proc bbox
parents
070139a5
2611e853
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
109 additions
and
42 deletions
+109
-42
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+66
-42
remove_bbox_overlap.py
magic_pdf/pre_proc/remove_bbox_overlap.py
+43
-0
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
59d817b1
...
@@ -5,7 +5,14 @@ import time
...
@@ -5,7 +5,14 @@ import time
from
loguru
import
logger
from
loguru
import
logger
from
demo.draw_bbox
import
draw_layout_bbox
,
draw_text_bbox
from
demo.draw_bbox
import
draw_layout_bbox
,
draw_text_bbox
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_docx_model_output
from
magic_pdf.libs.commons
import
(
read_file
,
join_path
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_docx_model_output
,
)
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
...
@@ -14,8 +21,12 @@ from magic_pdf.pre_proc.detect_header import parse_headers
...
@@ -14,8 +21,12 @@ from magic_pdf.pre_proc.detect_header import parse_headers
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
remove_overlaps_min_spans
,
merge_spans_to_line_by_layout
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
remove_overlaps_min_spans
,
merge_spans_to_line_by_layout
,
)
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
def
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
):
def
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
):
...
@@ -81,18 +92,23 @@ def parse_pdf_by_ocr(
...
@@ -81,18 +92,23 @@ def parse_pdf_by_ocr(
if
debug_mode
:
if
debug_mode
:
time_now
=
time
.
time
()
time_now
=
time
.
time
()
logger
.
info
(
f
"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
)
logger
.
info
(
f
"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
)
start_time
=
time_now
start_time
=
time_now
# 获取当前页的模型数据
# 获取当前页的模型数据
ocr_page_info
=
get_docx_model_output
(
pdf_model_output
,
pdf_model_profile
,
page_id
)
ocr_page_info
=
get_docx_model_output
(
pdf_model_output
,
pdf_model_profile
,
page_id
)
"""从json中获取每页的页码、页眉、页脚的bbox"""
"""从json中获取每页的页码、页眉、页脚的bbox"""
page_no_bboxes
=
parse_pageNos
(
page_id
,
page
,
ocr_page_info
)
page_no_bboxes
=
parse_pageNos
(
page_id
,
page
,
ocr_page_info
)
header_bboxes
=
parse_headers
(
page_id
,
page
,
ocr_page_info
)
header_bboxes
=
parse_headers
(
page_id
,
page
,
ocr_page_info
)
footer_bboxes
=
parse_footers
(
page_id
,
page
,
ocr_page_info
)
footer_bboxes
=
parse_footers
(
page_id
,
page
,
ocr_page_info
)
footnote_bboxes
=
parse_footnotes_by_model
(
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
footnote_bboxes
=
parse_footnotes_by_model
(
debug_mode
=
debug_mode
)
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
debug_mode
=
debug_mode
)
# 构建需要remove的bbox列表
# 构建需要remove的bbox列表
need_remove_spans_bboxes
=
[]
need_remove_spans_bboxes
=
[]
...
@@ -101,51 +117,57 @@ def parse_pdf_by_ocr(
...
@@ -101,51 +117,57 @@ def parse_pdf_by_ocr(
need_remove_spans_bboxes
.
extend
(
footer_bboxes
)
need_remove_spans_bboxes
.
extend
(
footer_bboxes
)
need_remove_spans_bboxes
.
extend
(
footnote_bboxes
)
need_remove_spans_bboxes
.
extend
(
footnote_bboxes
)
layout_dets
=
ocr_page_info
[
'layout_dets'
]
layout_dets
=
ocr_page_info
[
"layout_dets"
]
spans
=
[]
spans
=
[]
# 计算模型坐标和pymu坐标的缩放比例
# 计算模型坐标和pymu坐标的缩放比例
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
ocr_page_info
,
page
)
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
ocr_page_info
,
page
)
for
layout_det
in
layout_dets
:
for
layout_det
in
layout_dets
:
category_id
=
layout_det
[
'category_id'
]
category_id
=
layout_det
[
"category_id"
]
allow_category_id_list
=
[
1
,
7
,
13
,
14
,
15
]
allow_category_id_list
=
[
1
,
7
,
13
,
14
,
15
]
if
category_id
in
allow_category_id_list
:
if
category_id
in
allow_category_id_list
:
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
layout_det
[
'poly'
]
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
layout_det
[
"poly"
]
bbox
=
[
int
(
x0
/
horizontal_scale_ratio
),
int
(
y0
/
vertical_scale_ratio
),
bbox
=
[
int
(
x1
/
horizontal_scale_ratio
),
int
(
y1
/
vertical_scale_ratio
)]
int
(
x0
/
horizontal_scale_ratio
),
'''要删除的'''
int
(
y0
/
vertical_scale_ratio
),
int
(
x1
/
horizontal_scale_ratio
),
int
(
y1
/
vertical_scale_ratio
),
]
"""要删除的"""
# 3: 'header', # 页眉
# 3: 'header', # 页眉
# 4: 'page number', # 页码
# 4: 'page number', # 页码
# 5: 'footnote', # 脚注
# 5: 'footnote', # 脚注
# 6: 'footer', # 页脚
# 6: 'footer', # 页脚
'''当成span拼接的'''
"""当成span拼接的"""
# 1: 'image', # 图片
# 1: 'image', # 图片
# 7: 'table', # 表格
# 7: 'table', # 表格
# 13: 'inline_equation', # 行内公式
# 13: 'inline_equation', # 行内公式
# 14: 'displayed_equation', # 行间公式
# 14: 'displayed_equation', # 行间公式
# 15: 'text', # ocr识别文本
# 15: 'text', # ocr识别文本
'''layout信息'''
"""layout信息"""
# 11: 'full column', # 单栏
# 11: 'full column', # 单栏
# 12: 'sub column', # 多栏
# 12: 'sub column', # 多栏
span
=
{
span
=
{
'bbox'
:
bbox
,
"bbox"
:
bbox
,
}
}
if
category_id
==
1
:
if
category_id
==
1
:
span
[
'type'
]
=
'image'
span
[
"type"
]
=
"image"
elif
category_id
==
7
:
elif
category_id
==
7
:
span
[
'type'
]
=
'table'
span
[
"type"
]
=
"table"
elif
category_id
==
13
:
elif
category_id
==
13
:
span
[
'content'
]
=
layout_det
[
'latex'
]
span
[
"content"
]
=
layout_det
[
"latex"
]
span
[
'type'
]
=
'inline_equation'
span
[
"type"
]
=
"inline_equation"
elif
category_id
==
14
:
elif
category_id
==
14
:
span
[
'content'
]
=
layout_det
[
'latex'
]
span
[
"content"
]
=
layout_det
[
"latex"
]
span
[
'type'
]
=
'displayed_equation'
span
[
"type"
]
=
"displayed_equation"
elif
category_id
==
15
:
elif
category_id
==
15
:
span
[
'content'
]
=
layout_det
[
'text'
]
span
[
"content"
]
=
layout_det
[
"text"
]
span
[
'type'
]
=
'text'
span
[
"type"
]
=
"text"
# print(span)
# print(span)
spans
.
append
(
span
)
spans
.
append
(
span
)
else
:
else
:
...
@@ -160,12 +182,12 @@ def parse_pdf_by_ocr(
...
@@ -160,12 +182,12 @@ def parse_pdf_by_ocr(
# 对image和table截图
# 对image和table截图
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
)
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
# 模型识别错误的行间公式, type类型转换成行内公式
# 模型识别错误的行间公式, type类型转换成行内公式
# bbox去除粘连
# bbox去除粘连
spans
=
remove_overlap_between_bbox
(
spans
)
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
...
@@ -175,14 +197,15 @@ def parse_pdf_by_ocr(
...
@@ -175,14 +197,15 @@ def parse_pdf_by_ocr(
# 将spans合并成line(在layout内,从上到下,从左到右)
# 将spans合并成line(在layout内,从上到下,从左到右)
lines
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
lines
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks
=
[]
blocks
=
[]
for
line
in
lines
:
for
line
in
lines
:
blocks
.
append
({
blocks
.
append
(
"bbox"
:
line
[
'bbox'
],
{
"bbox"
:
line
[
"bbox"
],
"lines"
:
[
line
],
"lines"
:
[
line
],
})
}
)
# 构造pdf_info_dict
# 构造pdf_info_dict
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
)
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
)
...
@@ -190,7 +213,9 @@ def parse_pdf_by_ocr(
...
@@ -190,7 +213,9 @@ def parse_pdf_by_ocr(
# 在测试时,保存调试信息
# 在测试时,保存调试信息
if
debug_mode
:
if
debug_mode
:
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
...
@@ -198,5 +223,4 @@ def parse_pdf_by_ocr(
...
@@ -198,5 +223,4 @@ def parse_pdf_by_ocr(
draw_layout_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
draw_layout_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
return
pdf_info_dict
return
pdf_info_dict
magic_pdf/pre_proc/remove_bbox_overlap.py
0 → 100644
View file @
59d817b1
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
,
_is_in
def _remove_overlap_between_bbox(spans):
    """Pull apart span bboxes that partially overlap an earlier span.

    Spans are processed in order. Each incoming span is compared against
    every span already accepted; when the pair overlaps, both bboxes are
    trimmed so they meet at the midpoint of the overlapping band, leaving a
    one-unit gap (``mid`` for the lower/left box, ``mid + 1`` for the other).

    The cut is made along the axis where the overlap is thinner: if the
    overlap is taller than it is wide the boxes are split horizontally
    (x is adjusted), otherwise vertically (y is adjusted).

    Args:
        spans: iterable of dicts, each with a ``"bbox"`` entry that unpacks
            into ``x0, y0, x1, y1``.

    Returns:
        A new list holding the same span dicts in the same order. The
        ``"bbox"`` entry of any span involved in an overlap is replaced
        with a fresh ``[x0, y0, x1, y1]`` list, so the input dicts are
        modified in place.
    """
    kept = []
    for span in spans:
        for prev in kept:
            # NOTE(review): presumably `_is_in(a, b)` means "a lies fully
            # inside b" -- confirm in magic_pdf.libs.boxbase. Such pairs are
            # left untouched.
            if _is_in(prev["bbox"], span["bbox"]):
                continue
            if not _is_in_or_part_overlap(prev["bbox"], span["bbox"]):
                continue

            px0, py0, px1, py1 = prev["bbox"]
            cx0, cy0, cx1, cy1 = span["bbox"]

            # Extent of the intersection along each axis.
            overlap_w = min(cx1, px1) - max(cx0, px0)
            overlap_h = min(cy1, py1) - max(cy0, py0)

            if overlap_h > overlap_w:
                # Overlap is a tall, narrow band: separate the boxes along x.
                if cx1 >= px1:
                    # Current span extends further right -> it keeps the right side.
                    cut = (cx0 + px1) // 2
                    px1 = min(cut, px1)
                    cx0 = max(cut + 1, cx0)
                else:
                    cut = (px0 + cx1) // 2
                    px0 = max(cut + 1, px0)
                    cx1 = min(cut, cx1)
            elif cy1 >= py1:
                # Overlap is wide and flat: separate along y; the current
                # span reaches lower, so it keeps the bottom part.
                cut = (cy0 + py1) // 2
                cy0 = max(cut + 1, cy0)
                py1 = min(py1, cut)
            else:
                cut = (py0 + cy1) // 2
                cy1 = min(cy1, cut)
                py0 = max(cut + 1, py0)

            # Write the trimmed boxes back so later comparisons see them.
            prev["bbox"] = [px0, py0, px1, py1]
            span["bbox"] = [cx0, cy0, cx1, cy1]

        kept.append(span)
    return kept
def remove_overlap_between_bbox(spans):
    """Public entry point: return ``spans`` with overlapping bboxes separated.

    Delegates entirely to ``_remove_overlap_between_bbox`` and returns its
    result unchanged.
    """
    separated = _remove_overlap_between_bbox(spans)
    return separated
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment