Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
5b2d81aa
Commit
5b2d81aa
authored
Apr 18, 2024
by
许瑞
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat: support get images and tables
parent
53a63316
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
369 additions
and
26 deletions
+369
-26
boxbase.py
magic_pdf/libs/boxbase.py
+40
-1
math.py
magic_pdf/libs/math.py
+5
-0
magic_model.py
magic_pdf/model/magic_model.py
+324
-25
No files found.
magic_pdf/libs/boxbase.py
View file @
5b2d81aa
from
loguru
import
logger
import
math
def
_is_in_or_part_overlap
(
box1
,
box2
)
->
bool
:
"""
...
...
@@ -332,3 +332,42 @@ def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
return
right_boxes
[
0
]
else
:
return
None
def bbox_relative_pos(bbox1, bbox2):
    """Report on which sides ``bbox2`` is strictly separated from ``bbox1``.

    Both boxes are 4-item sequences unpacked as ``(x0, y0, x1, y1)``.

    Returns:
        A 4-tuple of booleans ``(left, right, bottom, top)``:

        * ``left``   -- ``bbox2``'s third value (x1) is smaller than ``bbox1``'s x0.
        * ``right``  -- ``bbox1``'s x1 is smaller than ``bbox2``'s x0.
        * ``bottom`` -- ``bbox2``'s fourth value (y1) is smaller than ``bbox1``'s y0.
        * ``top``    -- ``bbox1``'s y1 is smaller than ``bbox2``'s y0.

        All comparisons are strict, so boxes that merely touch or that
        overlap on an axis produce ``False`` for that axis.
    """
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2

    return (
        b_x1 < a_x0,  # left
        a_x1 < b_x0,  # right
        b_y1 < a_y0,  # bottom
        a_y1 < b_y0,  # top
    )
def bbox_distance(bbox1, bbox2):
    """Return the gap between two axis-aligned boxes given as ``(x0, y0, x1, y1)``.

    The relative placement is obtained from ``bbox_relative_pos``:

    * separated on both axes (diagonal placement) -> Euclidean distance
      between the two facing corners;
    * separated on a single axis -> the gap along that axis;
    * not separated on any axis -> ``0``.
    """

    def corner_gap(p, q):
        # Euclidean distance between two (x, y) points.
        return math.sqrt((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2)

    ax0, ay0, ax1, ay1 = bbox1
    bx0, by0, bx1, by1 = bbox2
    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)

    # The nesting below keeps the original precedence:
    # left (with top, then bottom) > right (with bottom, then top) > bottom > top.
    if left:
        if top:
            return corner_gap((ax0, ay1), (bx1, by0))
        if bottom:
            return corner_gap((ax0, ay0), (bx1, by1))
        return ax0 - bx1

    if right:
        if bottom:
            return corner_gap((ax1, ay0), (bx0, by1))
        if top:
            return corner_gap((ax1, ay1), (bx0, by0))
        return bx0 - ax1

    if bottom:
        return ay0 - by1
    if top:
        return by0 - ay1

    # rectangles intersect
    return 0
\ No newline at end of file
magic_pdf/libs/math.py
0 → 100644
View file @
5b2d81aa
def float_gt(a, b, eps=0.0001):
    """Return True when ``a`` is greater than ``b`` by more than ``eps``.

    Values whose absolute difference is at most ``eps`` are treated as equal,
    so the function returns ``False`` for them regardless of their order.

    Args:
        a: left-hand value.
        b: right-hand value.
        eps: tolerance below which the two values count as equal.
            Defaults to ``0.0001``, the tolerance that was previously
            hard-coded.

    Returns:
        ``True`` if ``a > b`` and the difference exceeds ``eps``,
        otherwise ``False``.
    """
    if abs(a - b) <= eps:
        return False
    return a > b
\ No newline at end of file
magic_pdf/model/magic_model.py
View file @
5b2d81aa
import
json
import
math
from
magic_pdf.libs.commons
import
fitz
from
loguru
import
logger
...
...
@@ -7,18 +8,22 @@ from magic_pdf.libs.commons import join_path
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.math
import
float_gt
from
magic_pdf.libs.boxbase
import
_is_in
,
bbox_relative_pos
,
bbox_distance
class
MagicModel
()
:
class
MagicModel
:
"""
每个函数没有得到元素的时候返回空list
"""
def
__fix_axis
(
self
):
for
model_page_info
in
self
.
__model_list
:
page_no
=
model_page_info
[
'page_info'
][
'page_no'
]
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
model_page_info
,
self
.
__docs
[
page_no
])
page_no
=
model_page_info
[
"page_info"
][
"page_no"
]
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
model_page_info
,
self
.
__docs
[
page_no
]
)
layout_dets
=
model_page_info
[
"layout_dets"
]
for
layout_det
in
layout_dets
:
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
layout_det
[
"poly"
]
...
...
@@ -35,20 +40,301 @@ class MagicModel():
self
.
__docs
=
docs
self
.
__fix_axis
()
def
get_imgs
(
self
,
page_no
:
int
):
# @许瑞
def __reduct_overlap(self, bboxes):
    """Return the boxes of ``bboxes`` that ``_is_in`` does not place inside another one.

    Every box is compared against every other box of the list; a box is
    dropped as soon as ``_is_in(box, other)`` is truthy for at least one
    ``other``. The relative order of the surviving boxes is preserved.
    """
    survivors = []
    for idx, candidate in enumerate(bboxes):
        nested = False
        for other_idx, other in enumerate(bboxes):
            # A box is never compared with itself.
            if other_idx != idx and _is_in(candidate, other):
                nested = True
        if not nested:
            survivors.append(candidate)
    return survivors
def __tie_up_category_by_distance(
    self, page_no, subject_category_id, object_category_id
):
    """Attach the nearest object boxes of a page to each subject box.

    Assumes every subject has at most one object (several adjacent objects
    are merged into a single one) and every object belongs to one subject.

    Args:
        page_no: index of the page inside ``self.__model_list``.
        subject_category_id: ``category_id`` of the subject detections.
        object_category_id: ``category_id`` of the object detections.

    Returns:
        A tuple ``(ret, total_subject_object_dis)``.

        ``ret`` holds one dict per subject, ordered by subject index, with
        the keys ``"subject_body"`` (the subject bbox), ``"all"`` (bbox
        enclosing the subject and its objects) and, only when at least one
        object was attached, ``"object_body"`` (bbox enclosing the objects).

        ``total_subject_object_dis`` is the summed distance of all attached
        pairs plus, for every object left unattached, the distance to the
        nearest subject that has no object yet (approximate).
    """

    def separated_sides(box_a, box_b):
        # Number of sides (left/right/bottom/top) on which box_b lies
        # strictly outside box_a; 2 means a diagonal placement.
        return sum(1 if flag else 0 for flag in bbox_relative_pos(box_a, box_b))

    ret = []
    MAX_DIS_OF_POINT = 10**9 + 7

    layout_dets = self.__model_list[page_no]["layout_dets"]
    subjects = self.__reduct_overlap(
        [det["bbox"] for det in layout_dets if det["category_id"] == subject_category_id]
    )
    objects = self.__reduct_overlap(
        [det["bbox"] for det in layout_dets if det["category_id"] == object_category_id]
    )
    subject_object_relation_map = {}

    # Order subjects by the squared distance of their first corner to the origin.
    subjects.sort(key=lambda x: x[0] ** 2 + x[1] ** 2)

    # Subjects first, then objects: indices into all_bboxes identify both kinds.
    all_bboxes = [
        {"category_id": subject_category_id, "bbox": v} for v in subjects
    ] + [{"category_id": object_category_id, "bbox": v} for v in objects]

    N = len(all_bboxes)
    dis = [[MAX_DIS_OF_POINT] * N for _ in range(N)]

    for i in range(N):
        for j in range(i):
            # Subject-to-subject distances are never needed; keep the sentinel.
            if (
                all_bboxes[i]["category_id"] == subject_category_id
                and all_bboxes[j]["category_id"] == subject_category_id
            ):
                continue
            dis[i][j] = bbox_distance(all_bboxes[i]["bbox"], all_bboxes[j]["bbox"])
            dis[j][i] = dis[i][j]

    used = set()
    for i in range(N):
        # Find the objects associated with the i-th subject.
        if all_bboxes[i]["category_id"] != subject_category_id:
            continue

        seen = set()
        candidates = []
        arr = []
        for j in range(N):
            # Skip boxes placed diagonally relative to the subject.
            if separated_sides(all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]) > 1:
                continue
            if (
                all_bboxes[j]["category_id"] != object_category_id
                or j in used
                or dis[i][j] == MAX_DIS_OF_POINT
            ):
                continue
            arr.append((dis[i][j], j))

        arr.sort(key=lambda x: x[0])
        if len(arr) > 0:
            candidates.append(arr[0][1])
            seen.add(arr[0][1])

        # The initial seed (nearest object) has been chosen; grow it with the
        # objects for which the seed is the strictly nearest neighbour.
        # NOTE(review): `candidates` holds at most one seed here, so this loop
        # body runs at most once; rebinding `candidates` below does not extend
        # the iteration. Behaviour kept as is.
        for j in set(candidates):
            tmp = []
            for k in range(i + 1, N):
                if separated_sides(all_bboxes[j]["bbox"], all_bboxes[k]["bbox"]) > 1:
                    continue
                if (
                    all_bboxes[k]["category_id"] != object_category_id
                    or k in used
                    or k in seen
                    or dis[j][k] == MAX_DIS_OF_POINT
                ):
                    continue

                is_nearest = True
                for m in range(i + 1, N):
                    if m in (j, k) or m in used or m in seen:
                        continue
                    # Another box is at least as close to k as j is.
                    if not float_gt(dis[m][k], dis[j][k]):
                        is_nearest = False
                        break

                if is_nearest:
                    tmp.append(k)
                    seen.add(k)

            candidates = tmp
            if len(candidates) == 0:
                break

        # `seen` now holds the objects gathered around this subject.
        # First compute the bbox enclosing the subject and all of them.
        group_boxes = [all_bboxes[idx]["bbox"] for idx in seen]
        group_boxes.append(all_bboxes[i]["bbox"])
        ox0 = min(b[0] for b in group_boxes)
        oy0 = min(b[1] for b in group_boxes)
        ox1 = max(b[2] for b in group_boxes)
        oy1 = max(b[3] for b in group_boxes)
        ix0, iy0, ix1, iy1 = all_bboxes[i]["bbox"]

        # Four regions of the enclosing box around the subject; measure the
        # rectangle occupied by the merged objects falling into each of them.
        caption_poses = [
            [ox0, oy0, ix0, oy1],
            [ox0, oy0, ox1, iy0],
            [ox0, iy1, ox1, oy1],
            [ix1, oy0, ox1, oy1],
        ]

        caption_areas = []
        for region in caption_poses:
            inside = [
                all_bboxes[idx]["bbox"]
                for idx in seen
                if _is_in(all_bboxes[idx]["bbox"], region)
            ]
            if len(inside) > 0:
                embed_x0 = min(b[0] for b in inside)
                embed_y0 = min(b[1] for b in inside)
                embed_x1 = max(b[2] for b in inside)
                embed_y1 = max(b[3] for b in inside)
                caption_areas.append(
                    int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
                )
            else:
                caption_areas.append(0)

        subject_object_relation_map[i] = []
        if max(caption_areas) > 0:
            # Keep only the objects of the region with the largest merged area.
            max_area_idx = caption_areas.index(max(caption_areas))
            caption_bbox = caption_poses[max_area_idx]

            for j in seen:
                if _is_in(all_bboxes[j]["bbox"], caption_bbox):
                    used.add(j)
                    subject_object_relation_map[i].append(j)

    for i in sorted(subject_object_relation_map.keys()):
        subject_bbox = all_bboxes[i]["bbox"]
        result = {
            "subject_body": subject_bbox,
            "all": subject_bbox,
        }

        attached = subject_object_relation_map[i]
        if len(attached) > 0:
            x0 = min(all_bboxes[j]["bbox"][0] for j in attached)
            y0 = min(all_bboxes[j]["bbox"][1] for j in attached)
            x1 = max(all_bboxes[j]["bbox"][2] for j in attached)
            y1 = max(all_bboxes[j]["bbox"][3] for j in attached)
            result["object_body"] = [x0, y0, x1, y1]
            result["all"] = [
                min(x0, subject_bbox[0]),
                min(y0, subject_bbox[1]),
                max(x1, subject_bbox[2]),
                max(y1, subject_bbox[3]),
            ]
        ret.append(result)

    total_subject_object_dis = 0
    # Distance of the pairs that were matched.
    for i in subject_object_relation_map.keys():
        for j in subject_object_relation_map[i]:
            total_subject_object_dis += bbox_distance(
                all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
            )

    # Distance of the unmatched subjects and objects (approximate version).
    # Fix: test each key's own list; the stale loop variable `i` was used before.
    with_caption_subject = set(
        key
        for key in subject_object_relation_map.keys()
        if len(subject_object_relation_map[key]) > 0
    )
    for i in range(N):
        if all_bboxes[i]["category_id"] != object_category_id or i in used:
            continue
        candidates = []
        for j in range(N):
            if (
                all_bboxes[j]["category_id"] != subject_category_id
                or j in with_caption_subject
            ):
                continue
            candidates.append((dis[i][j], j))
        if len(candidates) > 0:
            candidates.sort(key=lambda x: x[0])
            nearest_dis, nearest_subject = candidates[0]
            # Fix: add the distance (not the subject index) and reserve the
            # subject that was actually chosen (not the leftover loop index).
            total_subject_object_dis += nearest_dis
            with_caption_subject.add(nearest_subject)
    return ret, total_subject_object_dis
def get_imgs(self, page_no: int):  # owner: @许瑞
    """Return the image blocks of a page together with their caption boxes.

    Pairs category 3 (subject) with category 4 (object) through
    ``__tie_up_category_by_distance`` and reshapes each record.
    NOTE(review): presumably 3 = image body and 4 = image caption; confirm
    against the layout model's category table.

    Returns:
        A list of dicts with the keys ``"bbox"`` (record ``"all"``),
        ``"img_body_bbox"`` (record ``"subject_body"``) and
        ``"img_caption_bbox"`` (record ``"object_body"`` or ``None`` when the
        record has none).
    """
    records, _ = self.__tie_up_category_by_distance(page_no, 3, 4)

    images = []
    for record in records:
        image = {
            "bbox": record["all"],
            "img_body_bbox": record["subject_body"],
            "img_caption_bbox": record.get("object_body", None),
        }
        images.append(image)
    return images
def get_tables(self, page_no: int) -> list:
    """Return the table blocks of a page with caption and footnote boxes.

    Runs ``__tie_up_category_by_distance`` twice with subject category 5:
    once against category 6 and once against category 7, then merges the
    two results record by record.
    NOTE(review): presumably 5 = table body, 6 = table caption and
    7 = table footnote; confirm against the layout model's category table.

    Returns:
        A list of dicts with the keys ``"table_caption_bbox"``,
        ``"table_body_bbox"``, ``"table_footnote_bbox"`` (caption/footnote
        are ``None`` when absent) and ``"bbox"``, the box enclosing the
        ``"all"`` boxes of both pairings.
    """
    # Three boxes per table: caption, table body, table note.
    with_captions, _ = self.__tie_up_category_by_distance(page_no, 5, 6)
    with_footnotes, _ = self.__tie_up_category_by_distance(page_no, 5, 7)

    # Both pairings are expected to yield one record per table.
    assert len(with_captions) == len(with_footnotes)

    tables = []
    for captioned, footnoted in zip(with_captions, with_footnotes):
        table = {
            "table_caption_bbox": captioned.get("object_body", None),
            "table_body_bbox": captioned["subject_body"],
            "table_footnote_bbox": footnoted.get("object_body", None),
        }
        table["bbox"] = [
            min(captioned["all"][0], footnoted["all"][0]),
            min(captioned["all"][1], footnoted["all"][1]),
            max(captioned["all"][2], footnoted["all"][2]),
            max(captioned["all"][3], footnoted["all"][3]),
        ]
        tables.append(table)
    return tables
def get_equations(self, page_no: int) -> list:  # has coordinates as well as text
    # Returns the pair (inline_equations, interline_equations); note that this
    # is a tuple although the signature is annotated ``-> list``.
    # NOTE(review): neither name is assigned in the visible body and `page_no`
    # is unused here; unless they are defined elsewhere this raises NameError.
    # Looks like a placeholder -- confirm with the owner.
    return inline_equations, interline_equations  # @凯文
...
...
@@ -69,15 +355,28 @@ class MagicModel():
pass
# @小蒙
if __name__ == "__main__":
    # Manual smoke test with developer-local paths; not part of the library API.
    drw = DiskReaderWriter(r"D:/project/20231108code-clean")

    # Disabled scenario (kept for reference): Windows-style sample paths.
    if 0:
        pdf_file_path = r"linshixuqiu\19983-00.pdf"
        model_file_path = r"linshixuqiu\19983-00_new.json"
        pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
        model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
        model_list = json.loads(model_json_txt)
        write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00"
        img_bucket_path = "imgs"
        img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
        pdf_docs = fitz.open("pdf", pdf_bytes)
        magic_model = MagicModel(model_list, pdf_docs)

    # Active scenario: load one sample document and print the image blocks
    # found on its first seven pages.
    if 1:
        model_json_txt = drw.read("/opt/data/pdf/20240418/j.chroma.2009.03.042.json")
        model_list = json.loads(model_json_txt)
        pdf_bytes = drw.read(
            "/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf", AbsReaderWriter.MODE_BIN
        )
        pdf_docs = fitz.open("pdf", pdf_bytes)
        magic_model = MagicModel(model_list, pdf_docs)

        for page_no in range(7):
            print(magic_model.get_imgs(page_no))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment