Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
39e3a7b3
Unverified
Commit
39e3a7b3
authored
Apr 22, 2024
by
myhloli
Committed by
GitHub
Apr 22, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #49 from myhloli/master
文本框与标题框重叠,优先信任文本框
parents
c300c92b
83641d3d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
31 additions
and
20 deletions
+31
-20
magic_model.py
magic_pdf/model/magic_model.py
+29
-18
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+2
-2
No files found.
magic_pdf/model/magic_model.py
View file @
39e3a7b3
...
@@ -21,8 +21,8 @@ class MagicModel:
...
@@ -21,8 +21,8 @@ class MagicModel:
"""
"""
def
__fix_axis
(
self
):
def
__fix_axis
(
self
):
need_remove_list
=
[]
for
model_page_info
in
self
.
__model_list
:
for
model_page_info
in
self
.
__model_list
:
need_remove_list
=
[]
page_no
=
model_page_info
[
"page_info"
][
"page_no"
]
page_no
=
model_page_info
[
"page_info"
][
"page_no"
]
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
model_page_info
,
self
.
__docs
[
page_no
]
model_page_info
,
self
.
__docs
[
page_no
]
...
@@ -43,12 +43,24 @@ class MagicModel:
...
@@ -43,12 +43,24 @@ class MagicModel:
for
need_remove
in
need_remove_list
:
for
need_remove
in
need_remove_list
:
layout_dets
.
remove
(
need_remove
)
layout_dets
.
remove
(
need_remove
)
def
__fix_by_confidence
(
self
):
for
model_page_info
in
self
.
__model_list
:
need_remove_list
=
[]
layout_dets
=
model_page_info
[
"layout_dets"
]
for
layout_det
in
layout_dets
:
if
layout_det
[
"score"
]
<
0.6
:
need_remove_list
.
append
(
layout_det
)
else
:
continue
for
need_remove
in
need_remove_list
:
layout_dets
.
remove
(
need_remove
)
def
__init__
(
self
,
model_list
:
list
,
docs
:
fitz
.
Document
):
def
__init__
(
self
,
model_list
:
list
,
docs
:
fitz
.
Document
):
self
.
__model_list
=
model_list
self
.
__model_list
=
model_list
self
.
__docs
=
docs
self
.
__docs
=
docs
self
.
__fix_axis
()
self
.
__fix_axis
()
#@todo 移除置信度小于0.6的所有block
#@TODO 删除掉一些低置信度的会导致分段错误,后面再修复
# self.__fix_by_confidence()
def
__reduct_overlap
(
self
,
bboxes
):
def
__reduct_overlap
(
self
,
bboxes
):
N
=
len
(
bboxes
)
N
=
len
(
bboxes
)
...
@@ -63,13 +75,13 @@ class MagicModel:
...
@@ -63,13 +75,13 @@ class MagicModel:
return
[
bboxes
[
i
]
for
i
in
range
(
N
)
if
keep
[
i
]]
return
[
bboxes
[
i
]
for
i
in
range
(
N
)
if
keep
[
i
]]
def
__tie_up_category_by_distance
(
def
__tie_up_category_by_distance
(
self
,
page_no
,
subject_category_id
,
object_category_id
self
,
page_no
,
subject_category_id
,
object_category_id
):
):
"""
"""
假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object 只能属于一个 subject
假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object 只能属于一个 subject
"""
"""
ret
=
[]
ret
=
[]
MAX_DIS_OF_POINT
=
10
**
9
+
7
MAX_DIS_OF_POINT
=
10
**
9
+
7
subjects
=
self
.
__reduct_overlap
(
subjects
=
self
.
__reduct_overlap
(
list
(
list
(
...
@@ -112,8 +124,8 @@ class MagicModel:
...
@@ -112,8 +124,8 @@ class MagicModel:
for
i
in
range
(
N
):
for
i
in
range
(
N
):
for
j
in
range
(
i
):
for
j
in
range
(
i
):
if
(
if
(
all_bboxes
[
i
][
"category_id"
]
==
subject_category_id
all_bboxes
[
i
][
"category_id"
]
==
subject_category_id
and
all_bboxes
[
j
][
"category_id"
]
==
subject_category_id
and
all_bboxes
[
j
][
"category_id"
]
==
subject_category_id
):
):
continue
continue
...
@@ -143,9 +155,9 @@ class MagicModel:
...
@@ -143,9 +155,9 @@ class MagicModel:
if
pos_flag_count
>
1
:
if
pos_flag_count
>
1
:
continue
continue
if
(
if
(
all_bboxes
[
j
][
"category_id"
]
!=
object_category_id
all_bboxes
[
j
][
"category_id"
]
!=
object_category_id
or
j
in
used
or
j
in
used
or
dis
[
i
][
j
]
==
MAX_DIS_OF_POINT
or
dis
[
i
][
j
]
==
MAX_DIS_OF_POINT
):
):
continue
continue
arr
.
append
((
dis
[
i
][
j
],
j
))
arr
.
append
((
dis
[
i
][
j
],
j
))
...
@@ -174,10 +186,10 @@ class MagicModel:
...
@@ -174,10 +186,10 @@ class MagicModel:
continue
continue
if
(
if
(
all_bboxes
[
k
][
"category_id"
]
!=
object_category_id
all_bboxes
[
k
][
"category_id"
]
!=
object_category_id
or
k
in
used
or
k
in
used
or
k
in
seen
or
k
in
seen
or
dis
[
j
][
k
]
==
MAX_DIS_OF_POINT
or
dis
[
j
][
k
]
==
MAX_DIS_OF_POINT
):
):
continue
continue
is_nearest
=
True
is_nearest
=
True
...
@@ -185,12 +197,10 @@ class MagicModel:
...
@@ -185,12 +197,10 @@ class MagicModel:
if
l
in
(
j
,
k
)
or
l
in
used
or
l
in
seen
:
if
l
in
(
j
,
k
)
or
l
in
used
or
l
in
seen
:
continue
continue
if
not
float_gt
(
dis
[
l
][
k
],
dis
[
j
][
k
]):
if
not
float_gt
(
dis
[
l
][
k
],
dis
[
j
][
k
]):
is_nearest
=
False
is_nearest
=
False
break
break
if
is_nearest
:
if
is_nearest
:
tmp
.
append
(
k
)
tmp
.
append
(
k
)
seen
.
add
(
k
)
seen
.
add
(
k
)
...
@@ -303,8 +313,8 @@ class MagicModel:
...
@@ -303,8 +313,8 @@ class MagicModel:
candidates
=
[]
candidates
=
[]
for
j
in
range
(
N
):
for
j
in
range
(
N
):
if
(
if
(
all_bboxes
[
j
][
"category_id"
]
!=
subject_category_id
all_bboxes
[
j
][
"category_id"
]
!=
subject_category_id
or
j
in
with_caption_subject
or
j
in
with_caption_subject
):
):
continue
continue
candidates
.
append
((
dis
[
i
][
j
],
j
))
candidates
.
append
((
dis
[
i
][
j
],
j
))
...
@@ -326,7 +336,7 @@ class MagicModel:
...
@@ -326,7 +336,7 @@ class MagicModel:
]
]
def
get_tables
(
def
get_tables
(
self
,
page_no
:
int
self
,
page_no
:
int
)
->
list
:
# 3个坐标, caption, table主体,table-note
)
->
list
:
# 3个坐标, caption, table主体,table-note
with_captions
,
_
=
self
.
__tie_up_category_by_distance
(
page_no
,
5
,
6
)
with_captions
,
_
=
self
.
__tie_up_category_by_distance
(
page_no
,
5
,
6
)
with_footnotes
,
_
=
self
.
__tie_up_category_by_distance
(
page_no
,
5
,
7
)
with_footnotes
,
_
=
self
.
__tie_up_category_by_distance
(
page_no
,
5
,
7
)
...
@@ -441,6 +451,7 @@ class MagicModel:
...
@@ -441,6 +451,7 @@ class MagicModel:
blocks
.
append
(
block
)
blocks
.
append
(
block
)
return
blocks
return
blocks
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
drw
=
DiskReaderWriter
(
r"D:/project/20231108code-clean"
)
drw
=
DiskReaderWriter
(
r"D:/project/20231108code-clean"
)
if
0
:
if
0
:
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
39e3a7b3
...
@@ -28,7 +28,7 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
...
@@ -28,7 +28,7 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
InterlineEquation
,
None
,
None
,
None
,
None
])
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
InterlineEquation
,
None
,
None
,
None
,
None
])
'''block嵌套问题解决'''
'''block嵌套问题解决'''
'''文本框与标题框重叠,优先信任
标题
框'''
'''文本框与标题框重叠,优先信任
文本
框'''
all_bboxes
=
fix_text_overlap_title_blocks
(
all_bboxes
)
all_bboxes
=
fix_text_overlap_title_blocks
(
all_bboxes
)
'''任何框体与舍弃框重叠,优先信任舍弃框'''
'''任何框体与舍弃框重叠,优先信任舍弃框'''
all_bboxes
=
remove_need_drop_blocks
(
all_bboxes
,
discarded_blocks
)
all_bboxes
=
remove_need_drop_blocks
(
all_bboxes
,
discarded_blocks
)
...
@@ -60,7 +60,7 @@ def fix_text_overlap_title_blocks(all_bboxes):
...
@@ -60,7 +60,7 @@ def fix_text_overlap_title_blocks(all_bboxes):
text_block_bbox
=
text_block
[
0
],
text_block
[
1
],
text_block
[
2
],
text_block
[
3
]
text_block_bbox
=
text_block
[
0
],
text_block
[
1
],
text_block
[
2
],
text_block
[
3
]
title_block_bbox
=
title_block
[
0
],
title_block
[
1
],
title_block
[
2
],
title_block
[
3
]
title_block_bbox
=
title_block
[
0
],
title_block
[
1
],
title_block
[
2
],
title_block
[
3
]
if
calculate_iou
(
text_block_bbox
,
title_block_bbox
)
>
0.8
:
if
calculate_iou
(
text_block_bbox
,
title_block_bbox
)
>
0.8
:
all_bboxes
.
remove
(
t
ext
_block
)
all_bboxes
.
remove
(
t
itle
_block
)
return
all_bboxes
return
all_bboxes
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment