Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
1d16c319
Unverified
Commit
1d16c319
authored
Apr 23, 2024
by
myhloli
Committed by
GitHub
Apr 23, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #63 from icecraft/feat/add_draw_method
feat: draw block based on block_type
parents
fc234831
4aa48329
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
134 additions
and
40 deletions
+134
-40
draw_bbox.py
magic_pdf/libs/draw_bbox.py
+134
-40
No files found.
magic_pdf/libs/draw_bbox.py
View file @
1d16c319
...
...
@@ -12,9 +12,23 @@ def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
x0
,
y0
,
x1
,
y1
=
bbox
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
if
fill_config
:
page
.
draw_rect
(
rect_coords
,
color
=
None
,
fill
=
new_rgb
,
fill_opacity
=
0.3
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
None
,
fill
=
new_rgb
,
fill_opacity
=
0.3
,
width
=
0.5
,
overlay
=
True
,
)
# Draw the rectangle
else
:
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
fill_opacity
=
1
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
fill_opacity
=
1
,
width
=
0.5
,
overlay
=
True
,
)
# Draw the rectangle
def
draw_bbox_with_number
(
i
,
bbox_list
,
page
,
rgb_config
,
fill_config
):
...
...
@@ -27,37 +41,113 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
x0
,
y0
,
x1
,
y1
=
bbox
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
if
fill_config
:
page
.
draw_rect
(
rect_coords
,
color
=
None
,
fill
=
new_rgb
,
fill_opacity
=
0.3
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
None
,
fill
=
new_rgb
,
fill_opacity
=
0.3
,
width
=
0.5
,
overlay
=
True
,
)
# Draw the rectangle
else
:
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
fill_opacity
=
1
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
page
.
insert_text
((
x0
,
y0
+
10
),
str
(
j
+
1
),
fontsize
=
10
,
color
=
new_rgb
)
# Insert the index at the top left corner of the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
fill_opacity
=
1
,
width
=
0.5
,
overlay
=
True
,
)
# Draw the rectangle
page
.
insert_text
(
(
x0
,
y0
+
10
),
str
(
j
+
1
),
fontsize
=
10
,
color
=
new_rgb
)
# Insert the index at the top left corner of the rectangle
def
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
out_path
):
layout_bbox_list
=
[]
blocks_bbox_list
=
[]
dropped_bbox_list
=
[]
tables_list
,
tables_body_list
,
tables_caption_list
,
tables_footnote_list
=
(
[],
[],
[],
[],
)
imgs_list
,
imgs_body_list
,
imgs_caption_list
=
[],
[],
[]
titles_list
=
[]
texts_list
=
[]
interequations_list
=
[]
for
page
in
pdf_info
:
page_layout_list
=
[]
page_dropped_list
=
[]
page_blocks_bbox_list
=
[]
for
layout
in
page
[
'layout_bboxes'
]:
page_layout_list
.
append
(
layout
[
'layout_bbox'
])
tables
,
tables_body
,
tables_caption
,
tables_footnote
=
[],
[],
[],
[]
imgs
,
imgs_body
,
imgs_caption
=
[],
[],
[]
titles
=
[]
texts
=
[]
interequations
=
[]
for
layout
in
page
[
"layout_bboxes"
]:
page_layout_list
.
append
(
layout
[
"layout_bbox"
])
layout_bbox_list
.
append
(
page_layout_list
)
for
dropped_bbox
in
page
[
'discarded_blocks'
]:
page_dropped_list
.
append
(
dropped_bbox
[
'bbox'
])
for
dropped_bbox
in
page
[
"discarded_blocks"
]:
page_dropped_list
.
append
(
dropped_bbox
[
"bbox"
])
dropped_bbox_list
.
append
(
page_dropped_list
)
for
block
in
page
[
'para_blocks'
]:
page_blocks_bbox_list
.
append
(
block
[
'bbox'
])
blocks_bbox_list
.
append
(
page_blocks_bbox_list
)
for
block
in
page
[
"para_blocks"
]:
bbox
=
block
[
"bbox"
]
if
block
[
"type"
]
==
BlockType
.
Table
:
tables
.
append
(
bbox
)
for
nested_block
in
block
[
"blocks"
]:
bbox
=
nested_block
[
"bbox"
]
if
nested_block
[
"type"
]
==
BlockType
.
TableBody
:
tables_body
.
append
(
bbox
)
elif
nested_block
[
"type"
]
==
BlockType
.
TableCaption
:
tables_caption
.
append
(
bbox
)
elif
nested_block
[
"type"
]
==
BlockType
.
TableFootnote
:
tables_footnote
.
append
(
bbox
)
elif
block
[
"type"
]
==
BlockType
.
Image
:
imgs
.
append
(
bbox
)
for
nested_block
in
block
[
"blocks"
]:
bbox
=
nested_block
[
"bbox"
]
if
nested_block
[
"type"
]
==
BlockType
.
ImageBody
:
imgs_body
.
append
(
bbox
)
elif
nested_block
[
"type"
]
==
BlockType
.
ImageCaption
:
imgs_caption
.
append
(
bbox
)
elif
block
[
"type"
]
==
BlockType
.
Title
:
titles
.
append
(
bbox
)
elif
block
[
"type"
]
==
BlockType
.
Text
:
texts
.
append
(
bbox
)
elif
block
[
"type"
]
==
BlockType
.
InterlineEquation
:
interequations
.
append
(
bbox
)
tables_list
.
append
(
tables
)
tables_body_list
.
append
(
tables_body
)
tables_caption_list
.
append
(
tables_caption
)
tables_footnote_list
.
append
(
tables_footnote
)
imgs_list
.
append
(
imgs
)
imgs_body_list
.
append
(
imgs_body
)
imgs_caption_list
.
append
(
imgs_caption
)
titles_list
.
append
(
titles
)
texts_list
.
append
(
texts
)
interequations_list
.
append
(
interequations
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
for
i
,
page
in
enumerate
(
pdf_docs
):
draw_bbox_with_number
(
i
,
layout_bbox_list
,
page
,
[
255
,
0
,
0
],
False
)
draw_bbox_without_number
(
i
,
dropped_bbox_list
,
page
,
[
0
,
255
,
0
],
True
)
draw_bbox_without_number
(
i
,
blocks_bbox_list
,
page
,
[
0
,
0
,
255
],
True
)
draw_bbox_without_number
(
i
,
tables_list
,
page
,
[
153
,
153
,
0
],
True
)
# color !
draw_bbox_without_number
(
i
,
tables_body_list
,
page
,
[
204
,
204
,
0
],
True
)
draw_bbox_without_number
(
i
,
tables_caption_list
,
page
,
[
255
,
255
,
102
],
True
)
draw_bbox_without_number
(
i
,
tables_footnote_list
,
page
,
[
229
,
255
,
204
],
True
)
draw_bbox_without_number
(
i
,
imgs_list
,
page
,
[
51
,
102
,
0
],
True
)
draw_bbox_without_number
(
i
,
imgs_body_list
,
page
,
[
153
,
255
,
51
],
True
)
draw_bbox_without_number
(
i
,
imgs_caption_list
,
page
,
[
102
,
178
,
255
],
True
)
draw_bbox_without_number
(
i
,
titles_list
,
page
,
[
102
,
102
,
255
],
True
)
draw_bbox_without_number
(
i
,
texts_list
,
page
,
[
153
,
0
,
76
],
True
)
draw_bbox_without_number
(
i
,
interequations_list
,
page
,
[
160
,
160
,
160
],
True
)
# Save the PDF
pdf_docs
.
save
(
f
"{out_path}/layout.pdf"
)
def
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
out_path
):
text_list
=
[]
inline_equation_list
=
[]
...
...
@@ -70,34 +160,38 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
page_interline_equation_list
=
[]
page_image_list
=
[]
page_table_list
=
[]
for
block
in
page
[
'para_blocks'
]:
if
block
[
'type'
]
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
InterlineEquation
]:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
Text
:
page_text_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
InlineEquation
:
page_inline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
InterlineEquation
:
page_interline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
Image
:
page_image_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
Table
:
page_table_list
.
append
(
span
[
'bbox'
])
elif
block
[
'type'
]
in
[
BlockType
.
Image
,
BlockType
.
Table
]:
for
block
in
page
[
"para_blocks"
]:
if
block
[
"type"
]
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
InterlineEquation
,
]:
for
line
in
block
[
"lines"
]:
for
span
in
line
[
"spans"
]:
if
span
[
"type"
]
==
ContentType
.
Text
:
page_text_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
InlineEquation
:
page_inline_equation_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
InterlineEquation
:
page_interline_equation_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
Image
:
page_image_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
Table
:
page_table_list
.
append
(
span
[
"bbox"
])
elif
block
[
"type"
]
in
[
BlockType
.
Image
,
BlockType
.
Table
]:
for
sub_block
in
block
[
"blocks"
]:
for
line
in
sub_block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
Text
:
page_text_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
InlineEquation
:
page_inline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
InterlineEquation
:
page_interline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
Image
:
page_image_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
Table
:
page_table_list
.
append
(
span
[
'bbox'
])
for
line
in
sub_block
[
"lines"
]:
for
span
in
line
[
"spans"
]:
if
span
[
"type"
]
==
ContentType
.
Text
:
page_text_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
InlineEquation
:
page_inline_equation_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
InterlineEquation
:
page_interline_equation_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
Image
:
page_image_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
Table
:
page_table_list
.
append
(
span
[
"bbox"
])
text_list
.
append
(
page_text_list
)
inline_equation_list
.
append
(
page_inline_equation_list
)
interline_equation_list
.
append
(
page_interline_equation_list
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment