Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
96d17cb0
Commit
96d17cb0
authored
Apr 28, 2024
by
许瑞
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: using overlap area ratio to calculate box relation when build figure/caption relations
parent
683fa633
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
42 additions
and
28 deletions
+42
-28
magic_model.py
magic_pdf/model/magic_model.py
+42
-28
No files found.
magic_pdf/model/magic_model.py
View file @
96d17cb0
...
@@ -10,9 +10,16 @@ from magic_pdf.libs.ocr_content_type import ContentType
...
@@ -10,9 +10,16 @@ from magic_pdf.libs.ocr_content_type import ContentType
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.math
import
float_gt
from
magic_pdf.libs.math
import
float_gt
from
magic_pdf.libs.boxbase
import
_is_in
,
bbox_relative_pos
,
bbox_distance
from
magic_pdf.libs.boxbase
import
(
_is_in
,
bbox_relative_pos
,
bbox_distance
,
_is_part_overlap
,
calculate_overlap_area_in_bbox1_area_ratio
,
)
from
magic_pdf.libs.ModelBlockTypeEnum
import
ModelBlockTypeEnum
from
magic_pdf.libs.ModelBlockTypeEnum
import
ModelBlockTypeEnum
CAPATION_OVERLAP_AREA_RATIO
=
0.6
class
MagicModel
:
class
MagicModel
:
"""
"""
...
@@ -80,7 +87,7 @@ class MagicModel:
...
@@ -80,7 +87,7 @@ class MagicModel:
假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object 只能属于一个 subject
假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object 只能属于一个 subject
"""
"""
ret
=
[]
ret
=
[]
MAX_DIS_OF_POINT
=
10
**
9
+
7
MAX_DIS_OF_POINT
=
10
**
9
+
7
subjects
=
self
.
__reduct_overlap
(
subjects
=
self
.
__reduct_overlap
(
list
(
list
(
...
@@ -238,7 +245,7 @@ class MagicModel:
...
@@ -238,7 +245,7 @@ class MagicModel:
for
bbox
in
caption_poses
:
for
bbox
in
caption_poses
:
embed_arr
=
[]
embed_arr
=
[]
for
idx
in
seen
:
for
idx
in
seen
:
if
_is_in
(
all_bboxes
[
idx
][
"bbox"
],
bbox
)
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
all_bboxes
[
idx
][
"bbox"
],
bbox
)
>
CAPATION_OVERLAP_AREA_RATIO
:
embed_arr
.
append
(
idx
)
embed_arr
.
append
(
idx
)
if
len
(
embed_arr
)
>
0
:
if
len
(
embed_arr
)
>
0
:
...
@@ -258,7 +265,7 @@ class MagicModel:
...
@@ -258,7 +265,7 @@ class MagicModel:
caption_bbox
=
caption_poses
[
max_area_idx
]
caption_bbox
=
caption_poses
[
max_area_idx
]
for
j
in
seen
:
for
j
in
seen
:
if
_is_in
(
all_bboxes
[
j
][
"bbox"
],
caption_bbox
)
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
all_bboxes
[
j
][
"bbox"
],
caption_bbox
)
>
CAPATION_OVERLAP_AREA_RATIO
:
used
.
add
(
j
)
used
.
add
(
j
)
subject_object_relation_map
[
i
]
.
append
(
j
)
subject_object_relation_map
[
i
]
.
append
(
j
)
...
@@ -358,9 +365,15 @@ class MagicModel:
...
@@ -358,9 +365,15 @@ class MagicModel:
return
ret
return
ret
def
get_equations
(
self
,
page_no
:
int
)
->
list
:
# 有坐标,也有字
def
get_equations
(
self
,
page_no
:
int
)
->
list
:
# 有坐标,也有字
inline_equations
=
self
.
__get_blocks_by_type
(
ModelBlockTypeEnum
.
EMBEDDING
.
value
,
page_no
,
[
"latex"
])
inline_equations
=
self
.
__get_blocks_by_type
(
interline_equations
=
self
.
__get_blocks_by_type
(
ModelBlockTypeEnum
.
ISOLATED
.
value
,
page_no
,
[
"latex"
])
ModelBlockTypeEnum
.
EMBEDDING
.
value
,
page_no
,
[
"latex"
]
interline_equations_blocks
=
self
.
__get_blocks_by_type
(
ModelBlockTypeEnum
.
ISOLATE_FORMULA
.
value
,
page_no
)
)
interline_equations
=
self
.
__get_blocks_by_type
(
ModelBlockTypeEnum
.
ISOLATED
.
value
,
page_no
,
[
"latex"
]
)
interline_equations_blocks
=
self
.
__get_blocks_by_type
(
ModelBlockTypeEnum
.
ISOLATE_FORMULA
.
value
,
page_no
)
return
inline_equations
,
interline_equations
,
interline_equations_blocks
return
inline_equations
,
interline_equations
,
interline_equations_blocks
def
get_discarded
(
self
,
page_no
:
int
)
->
list
:
# 自研模型,只有坐标
def
get_discarded
(
self
,
page_no
:
int
)
->
list
:
# 自研模型,只有坐标
...
@@ -382,7 +395,7 @@ class MagicModel:
...
@@ -382,7 +395,7 @@ class MagicModel:
for
layout_det
in
layout_dets
:
for
layout_det
in
layout_dets
:
if
layout_det
[
"category_id"
]
==
"15"
:
if
layout_det
[
"category_id"
]
==
"15"
:
span
=
{
span
=
{
"bbox"
:
layout_det
[
'bbox'
],
"bbox"
:
layout_det
[
"bbox"
],
"content"
:
layout_det
[
"text"
],
"content"
:
layout_det
[
"text"
],
}
}
text_spans
.
append
(
span
)
text_spans
.
append
(
span
)
...
@@ -402,9 +415,7 @@ class MagicModel:
...
@@ -402,9 +415,7 @@ class MagicModel:
for
layout_det
in
layout_dets
:
for
layout_det
in
layout_dets
:
category_id
=
layout_det
[
"category_id"
]
category_id
=
layout_det
[
"category_id"
]
if
category_id
in
allow_category_id_list
:
if
category_id
in
allow_category_id_list
:
span
=
{
span
=
{
"bbox"
:
layout_det
[
"bbox"
]}
"bbox"
:
layout_det
[
'bbox'
]
}
if
category_id
==
3
:
if
category_id
==
3
:
span
[
"type"
]
=
ContentType
.
Image
span
[
"type"
]
=
ContentType
.
Image
elif
category_id
==
5
:
elif
category_id
==
5
:
...
@@ -429,7 +440,9 @@ class MagicModel:
...
@@ -429,7 +440,9 @@ class MagicModel:
page_h
=
page
.
rect
.
height
page_h
=
page
.
rect
.
height
return
page_w
,
page_h
return
page_w
,
page_h
def
__get_blocks_by_type
(
self
,
type
:
int
,
page_no
:
int
,
extra_col
:
list
[
str
]
=
[])
->
list
:
def
__get_blocks_by_type
(
self
,
type
:
int
,
page_no
:
int
,
extra_col
:
list
[
str
]
=
[]
)
->
list
:
blocks
=
[]
blocks
=
[]
for
page_dict
in
self
.
__model_list
:
for
page_dict
in
self
.
__model_list
:
layout_dets
=
page_dict
.
get
(
"layout_dets"
,
[])
layout_dets
=
page_dict
.
get
(
"layout_dets"
,
[])
...
@@ -442,14 +455,15 @@ class MagicModel:
...
@@ -442,14 +455,15 @@ class MagicModel:
bbox
=
item
.
get
(
"bbox"
,
None
)
bbox
=
item
.
get
(
"bbox"
,
None
)
if
category_id
==
type
:
if
category_id
==
type
:
block
=
{
block
=
{
"bbox"
:
bbox
}
"bbox"
:
bbox
}
for
col
in
extra_col
:
for
col
in
extra_col
:
block
[
col
]
=
item
.
get
(
col
,
None
)
block
[
col
]
=
item
.
get
(
col
,
None
)
blocks
.
append
(
block
)
blocks
.
append
(
block
)
return
blocks
return
blocks
def
get_model_list
(
self
,
page_no
):
return
self
.
__model_list
[
page_no
]
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
drw
=
DiskReaderWriter
(
r"D:/project/20231108code-clean"
)
drw
=
DiskReaderWriter
(
r"D:/project/20231108code-clean"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment