Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
34a13a89
Commit
34a13a89
authored
Oct 28, 2024
by
icecraft
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: add priority match rule
parent
73913484
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
61 additions
and
8 deletions
+61
-8
magic_model.py
magic_pdf/model/magic_model.py
+61
-8
No files found.
magic_pdf/model/magic_model.py
View file @
34a13a89
import
enum
import
json
import
json
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.data.dataset
import
Dataset
...
@@ -18,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6
...
@@ -18,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6
MERGE_BOX_OVERLAP_AREA_RATIO
=
1.1
MERGE_BOX_OVERLAP_AREA_RATIO
=
1.1
class PosRelationEnum(enum.Enum):
    """Position selector handed to the tie-up routine as ``priority_pos``.

    In the visible matching code, ``BOTTOM`` and ``UP`` only matter when an
    object has a finite distance to both a 'top' and a 'bottom' candidate
    and those two distances are close to each other: ``BOTTOM`` then picks
    the 'bottom' candidate, ``UP`` picks the 'top' one. Any other member
    (callers pass ``ALL`` for footnotes) skips that priority shortcut.
    """

    # NOTE(review): LEFT and RIGHT are not referenced in the visible code;
    # presumably reserved for horizontal priority rules -- confirm.
    LEFT = 'left'
    RIGHT = 'right'
    # Prefer the 'top' candidate when top/bottom distances are near-equal.
    UP = 'up'
    # Prefer the 'bottom' candidate when top/bottom distances are near-equal.
    BOTTOM = 'bottom'
    # No directional preference; the priority shortcut is not applied.
    ALL = 'all'
class
MagicModel
:
class
MagicModel
:
"""每个函数没有得到元素的时候返回空list."""
"""每个函数没有得到元素的时候返回空list."""
...
@@ -591,9 +600,23 @@ class MagicModel:
...
@@ -591,9 +600,23 @@ class MagicModel:
return
ret
,
total_subject_object_dis
return
ret
,
total_subject_object_dis
def
__tie_up_category_by_distance_v2
(
def
__tie_up_category_by_distance_v2
(
self
,
page_no
,
subject_category_id
,
object_category_id
self
,
page_no
:
int
,
subject_category_id
:
int
,
object_category_id
:
int
,
priority_pos
:
PosRelationEnum
,
):
):
"""_summary_
Args:
page_no (int): _description_
subject_category_id (int): _description_
object_category_id (int): _description_
priority_pos (PosRelationEnum): _description_
Returns:
_type_: _description_
"""
AXIS_MULPLICITY
=
0.5
AXIS_MULPLICITY
=
0.5
subjects
=
self
.
__reduct_overlap
(
subjects
=
self
.
__reduct_overlap
(
list
(
list
(
...
@@ -680,6 +703,27 @@ class MagicModel:
...
@@ -680,6 +703,27 @@ class MagicModel:
j
,
j
,
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]),
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]),
]
]
if
(
dis_by_directions
[
'top'
][
i
][
1
]
!=
float
(
'inf'
)
and
dis_by_directions
[
'bottom'
][
i
][
1
]
!=
float
(
'inf'
)
and
priority_pos
in
(
PosRelationEnum
.
BOTTOM
,
PosRelationEnum
.
UP
)
):
RATIO
=
3
if
(
abs
(
dis_by_directions
[
'top'
][
i
][
1
]
-
dis_by_directions
[
'bottom'
][
i
][
1
]
)
<
RATIO
*
axis_unit
):
if
priority_pos
==
PosRelationEnum
.
BOTTOM
:
sub_obj_map_h
[
dis_by_directions
[
'bottom'
][
i
][
0
]]
.
append
(
i
)
else
:
sub_obj_map_h
[
dis_by_directions
[
'top'
][
i
][
0
]]
.
append
(
i
)
continue
if
dis_by_directions
[
'left'
][
i
][
1
]
!=
float
(
'inf'
)
or
dis_by_directions
[
if
dis_by_directions
[
'left'
][
i
][
1
]
!=
float
(
'inf'
)
or
dis_by_directions
[
'right'
'right'
][
i
][
1
]
!=
float
(
'inf'
):
][
i
][
1
]
!=
float
(
'inf'
):
...
@@ -735,9 +779,12 @@ class MagicModel:
...
@@ -735,9 +779,12 @@ class MagicModel:
top_bottom_x_axis
=
top_bottom
[
2
]
-
top_bottom
[
0
]
top_bottom_x_axis
=
top_bottom
[
2
]
-
top_bottom
[
0
]
bottom_top_x_axis
=
bottom_top
[
2
]
-
bottom_top
[
0
]
bottom_top_x_axis
=
bottom_top
[
2
]
-
bottom_top
[
0
]
if
abs
(
top_bottom_x_axis
-
l_x_axis
)
+
dis_by_directions
[
'bottom'
][
i
][
1
]
>
abs
(
if
(
bottom_top_x_axis
-
l_x_axis
abs
(
top_bottom_x_axis
-
l_x_axis
)
)
+
dis_by_directions
[
'top'
][
i
][
1
]:
+
dis_by_directions
[
'bottom'
][
i
][
1
]
>
abs
(
bottom_top_x_axis
-
l_x_axis
)
+
dis_by_directions
[
'top'
][
i
][
1
]
):
top_or_bottom
=
dis_by_directions
[
'top'
][
i
]
top_or_bottom
=
dis_by_directions
[
'top'
][
i
]
else
:
else
:
top_or_bottom
=
dis_by_directions
[
'bottom'
][
i
]
top_or_bottom
=
dis_by_directions
[
'bottom'
][
i
]
...
@@ -798,9 +845,11 @@ class MagicModel:
...
@@ -798,9 +845,11 @@ class MagicModel:
return
ret
return
ret
def
get_imgs_v2
(
self
,
page_no
:
int
):
def
get_imgs_v2
(
self
,
page_no
:
int
):
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
3
,
4
)
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
3
,
4
,
PosRelationEnum
.
BOTTOM
)
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
3
,
CategoryId
.
ImageFootnote
page_no
,
3
,
CategoryId
.
ImageFootnote
,
PosRelationEnum
.
ALL
)
)
ret
=
[]
ret
=
[]
for
v
in
with_captions
:
for
v
in
with_captions
:
...
@@ -815,8 +864,12 @@ class MagicModel:
...
@@ -815,8 +864,12 @@ class MagicModel:
return
ret
return
ret
def
get_tables_v2
(
self
,
page_no
:
int
)
->
list
:
def
get_tables_v2
(
self
,
page_no
:
int
)
->
list
:
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
5
,
6
)
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
5
,
7
)
page_no
,
5
,
6
,
PosRelationEnum
.
UP
)
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
5
,
7
,
PosRelationEnum
.
ALL
)
ret
=
[]
ret
=
[]
for
v
in
with_captions
:
for
v
in
with_captions
:
record
=
{
record
=
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment