Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
09269c84
Commit
09269c84
authored
Mar 20, 2024
by
许瑞
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat: add extract_train_data
parent
056aed86
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
1591 additions
and
134 deletions
+1591
-134
pdf_parse_for_train.py
magic_pdf/pdf_parse_for_train.py
+684
-0
pipeline.py
magic_pdf/pipeline.py
+310
-134
__init__.py
magic_pdf/train_utils/__init__.py
+0
-0
convert_to_train_format.py
magic_pdf/train_utils/convert_to_train_format.py
+52
-0
extract_caption.py
magic_pdf/train_utils/extract_caption.py
+59
-0
remove_footer_header.py
magic_pdf/train_utils/remove_footer_header.py
+159
-0
vis_utils.py
magic_pdf/train_utils/vis_utils.py
+327
-0
No files found.
magic_pdf/pdf_parse_for_train.py
0 → 100644
View file @
09269c84
This diff is collapsed.
Click to expand it.
magic_pdf/pipeline.py
View file @
09269c84
This diff is collapsed.
Click to expand it.
magic_pdf/train_utils/__init__.py
0 → 100644
View file @
09269c84
magic_pdf/train_utils/convert_to_train_format.py
0 → 100644
View file @
09269c84
def convert_to_train_format(jso: dict) -> list:
    """Convert per-page parse results into a list of page annotation dicts.

    Args:
        jso: Mapping whose values are per-page dicts. Only the values are
            read; the mapping keys are ignored. Each page dict must provide
            ``page_idx``, ``page_size`` (unpacked as ``width, height``) and
            every box collection listed in the tables below.

    Returns:
        One dict per page, in the iteration order of ``jso``::

            {
                "page_info": {"page_no": ..., "height": ..., "width": ...},
                "bboxes": [{"category_id": int, "bbox": ...}, ...],
            }

        Entries built from ``image_bboxes_with_caption`` and
        ``table_bboxes_with_caption`` additionally carry ``caption_bbox``
        when the source item has a ``caption`` key.

    Raises:
        KeyError: If a page dict lacks one of the required keys.
    """
    # (page key, category_id) for collections whose items are dicts holding
    # a "bbox" and, optionally, a "caption".
    captioned_sources = (
        ("image_bboxes_with_caption", 1),
        ("table_bboxes_with_caption", 7),
    )
    # (page key, category_id) for collections whose items are the bbox itself.
    bare_sources = (
        ("bak_page_no_bboxes", 4),
        ("bak_header_bboxes", 3),
        ("bak_footer_bboxes", 6),
    )
    # (page key, category_id) for collections whose items are dicts holding
    # a "bbox". Note carried over from the original code, which sat at this
    # position: footnotes -- no example has been seen so far.
    boxed_sources = (
        ("para_blocks", 2),
        ("inline_equations", 13),
        ("interline_equations", 10),
    )

    pages = []
    for page in jso.values():
        page_idx = page["page_idx"]
        width, height = page["page_size"]
        info = {
            "page_info": {"page_no": page_idx, "height": height, "width": width}
        }

        # The emission order below (captioned, bare, boxed) is part of the
        # output contract: it matches the order of the original loops.
        bboxes = []
        for key, category_id in captioned_sources:
            for item in page[key]:
                entry = {"category_id": category_id, "bbox": item["bbox"]}
                if "caption" in item:
                    entry["caption_bbox"] = item["caption"]
                bboxes.append(entry)

        for key, category_id in bare_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": box} for box in page[key]
            )

        for key, category_id in boxed_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": item["bbox"]}
                for item in page[key]
            )

        info["bboxes"] = bboxes
        pages.append(info)

    return pages
magic_pdf/train_utils/extract_caption.py
0 → 100644
View file @
09269c84
from
magic_pdf.libs.boxbase
import
_is_in
def extract_caption_bbox(outer: list, inner: list) -> list:
    """Pair inner boxes with enclosing outer boxes and derive caption boxes.

    For each inner box, the first still-unused outer box is selected for
    which ``_is_in(inner_box, outer_box)`` is true and whose four coordinates
    are not all within 0.01 of the inner box's. ``_is_in`` is imported from
    ``magic_pdf.libs.boxbase``; presumably it reports whether the first box
    lies inside the second -- confirm against that module. The space between
    the matched outer box and the inner box is split into four rectangles,
    and the one with the largest area is reported as the caption.

    Args:
        outer: Boxes, each unpackable as ``x0, y0, x1, y1``.
        inner: Boxes whose first four items are ``x0, y0, x1, y1``; any
            further items are ignored.

    Returns:
        One dict per inner box, in input order::

            {
                "bbox": [1, 2, 3, 4],
                "caption": [5, 6, 7, 8],  # only when an outer box matched
            }
    """
    # Function-scope import so the file's import block needs no change.
    import logging

    logger = logging.getLogger(__name__)
    # Diagnostics go through logging at debug level instead of stdout.
    logger.debug("extract_caption_bbox outer=%s inner=%s", outer, inner)

    def is_float_equal(a, b):
        # Deliberately non-strict float comparison.
        return abs(a - b) < 0.01

    def area(rect):
        return abs(rect[0] - rect[2]) * abs(rect[1] - rect[3])

    # Index -> outer box; entries are removed once consumed by a match.
    unused_outer = dict(enumerate(outer))
    found_count = 0
    ret = []
    for v in inner:
        # Only the first four items are coordinates; slicing first keeps the
        # unpack below from failing on boxes that carry extra fields.
        inner_box = v[:4]
        ix0, iy0, ix1, iy1 = inner_box
        d = {"bbox": inner_box}

        found_idx = None
        for k, outer_box in unused_outer.items():
            ox0, oy0, ox1, oy1 = outer_box
            same_box = (
                is_float_equal(ix0, ox0)
                and is_float_equal(iy0, oy0)
                and is_float_equal(ix1, ox1)
                and is_float_equal(iy1, oy1)
            )
            if _is_in(inner_box, outer_box) and not same_box:
                found_idx = k
                break

        if found_idx is not None:
            found_count += 1
            # One outer box may determine the caption of only one inner box,
            # so it is removed from the pool as soon as it is matched.
            ox0, oy0, ox1, oy1 = unused_outer.pop(found_idx)
            candidates = [
                [ox0, oy0, ix0, oy1],  # between the outer and inner x0 edges
                [ox0, oy0, ox1, iy0],  # between the outer and inner y0 edges
                [ox0, iy1, ox1, oy1],  # between the inner and outer y1 edges
                [ix1, oy0, ox1, oy1],  # between the inner and outer x1 edges
            ]
            # The rectangle with the largest area is the caption. Scanning in
            # reverse makes ties resolve to the later candidate, matching the
            # previous "stable sort, take the last element" behaviour.
            d["caption"] = max(reversed(candidates), key=area)

        ret.append(d)

    logger.debug("found_count: %s", found_count)
    return ret
magic_pdf/train_utils/remove_footer_header.py
0 → 100644
View file @
09269c84
import
re
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf.libs.drop_tag
import
CONTENT_IN_FOOT_OR_HEADER
,
PAGE_NO
"""
copy from pre_proc/remove_footer_header.py
"""
def remove_headder_footer_one_page(
    text_raw_blocks,
    image_bboxes,
    table_bboxes,
    header_bboxs,
    footer_bboxs,
    page_no_bboxs,
    page_w,
    page_h,
):
    """Return one page's content unchanged; header/footer removal is disabled.

    This function previously began with an unconditional ``if 1: return ...``
    that made all of its header, footer and page-number removal logic
    unreachable. That dead code has been dropped and the pass-through made
    explicit. According to the note preceding this function in the file, the
    removal logic was copied from ``pre_proc/remove_footer_header.py``.

    Args:
        text_raw_blocks: Text blocks of the page; returned as-is.
        image_bboxes: Image boxes of the page; returned as-is.
        table_bboxes: Table boxes of the page; returned as-is.
        header_bboxs: Accepted for interface compatibility; not used.
        footer_bboxs: Accepted for interface compatibility; not used.
        page_no_bboxs: Accepted for interface compatibility; not used.
        page_w: Accepted for interface compatibility; not used.
        page_h: Accepted for interface compatibility; not used.

    Returns:
        A 6-tuple ``(image_bboxes, table_bboxes, text_raw_blocks, [], [], [])``.
        The first three items are the very objects passed in (no copy is
        made); the last three are fresh empty lists standing in for the
        removed text blocks, removed image boxes and removed table boxes.
    """
    # Nothing is removed: remaining content is the input, removed lists are empty.
    return image_bboxes, table_bboxes, text_raw_blocks, [], [], []
magic_pdf/train_utils/vis_utils.py
0 → 100644
View file @
09269c84
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment