Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
71a042d9
Commit
71a042d9
authored
Mar 05, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
footnote检测逻辑更新
parent
779d2e8a
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
16 additions
and
6 deletions
+16
-6
remove_footnote.py
magic_pdf/post_proc/remove_footnote.py
+2
-1
detect_footnote.py
magic_pdf/pre_proc/detect_footnote.py
+14
-5
No files found.
magic_pdf/post_proc/remove_footnote.py
View file @
71a042d9
...
@@ -75,7 +75,8 @@ def merge_footnote_blocks(page_info, main_text_font):
...
@@ -75,7 +75,8 @@ def merge_footnote_blocks(page_info, main_text_font):
is_below
(
block
[
'bbox'
],
footnote_bbox
)
and
is_below
(
block
[
'bbox'
],
footnote_bbox
)
and
sum
([
size
>=
main_text_size
,
sum
([
size
>=
main_text_size
,
len
(
block
[
'lines'
])
>=
5
,
len
(
block
[
'lines'
])
>=
5
,
block_font
==
main_text_font
])
>=
2
]
block_font
==
main_text_font
])
>=
2
]
# 如果main_text_bboxes_below不为空,说明footnote下面有正文block,这个footnote不成立,跳过
# 如果main_text_bboxes_below不为空,说明footnote下面有正文block,这个footnote不成立,跳过
if
len
(
main_text_bboxes_below
)
>
0
:
if
len
(
main_text_bboxes_below
)
>
0
:
continue
continue
...
...
magic_pdf/pre_proc/detect_footnote.py
View file @
71a042d9
...
@@ -104,7 +104,8 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
...
@@ -104,7 +104,8 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
list: 符合规则的脚注文本块的边界框列表。
list: 符合规则的脚注文本块的边界框列表。
"""
"""
if
page_id
>
20
:
# if page_id > 20:
if
page_id
>
2
:
# 为保证精确度,先只筛选前3页
return
[]
return
[]
else
:
else
:
# 存储每一行的文本块大小的列表
# 存储每一行的文本块大小的列表
...
@@ -128,7 +129,7 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
...
@@ -128,7 +129,7 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
block_line_sizes
.
append
(
line_size
)
block_line_sizes
.
append
(
line_size
)
span_font
=
[(
span
[
'font'
],
len
(
span
[
'text'
]))
for
span
in
line
[
'spans'
]
if
'font'
in
span
and
len
(
span
[
'text'
])
>
0
]
span_font
=
[(
span
[
'font'
],
len
(
span
[
'text'
]))
for
span
in
line
[
'spans'
]
if
'font'
in
span
and
len
(
span
[
'text'
])
>
0
]
if
span_font
:
if
span_font
:
#
# todo
main_text_font应该用基于字数最多的字体而不是span级别的统计
# main_text_font应该用基于字数最多的字体而不是span级别的统计
# font_names.append(font_name for font_name in span_font)
# font_names.append(font_name for font_name in span_font)
# block_fonts.append(font_name for font_name in span_font)
# block_fonts.append(font_name for font_name in span_font)
for
font
,
count
in
span_font
:
for
font
,
count
in
span_font
:
...
@@ -158,9 +159,17 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
...
@@ -158,9 +159,17 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
# and len(block['lines']) < 5]
# and len(block['lines']) < 5]
footnote_bboxes
=
[
block
[
'bbox'
]
for
block
,
block_size
,
block_font
in
block_sizes
if
footnote_bboxes
=
[
block
[
'bbox'
]
for
block
,
block_size
,
block_font
in
block_sizes
if
block
[
'bbox'
][
1
]
>
page_height
*
0.6
and
block
[
'bbox'
][
1
]
>
page_height
*
0.6
and
sum
([
block_size
<
main_text_size
,
# 较为严格的规则
len
(
block
[
'lines'
])
<
5
,
block_size
<
main_text_size
and
block_font
!=
main_text_font
])
>=
2
]
(
len
(
block
[
'lines'
])
<
5
or
block_font
!=
main_text_font
)]
# 较为宽松的规则
# sum([block_size < main_text_size,
# len(block['lines']) < 5,
# block_font != main_text_font])
# >= 2]
return
footnote_bboxes
return
footnote_bboxes
else
:
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment