Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
fa3475a4
Unverified
Commit
fa3475a4
authored
Aug 09, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Aug 09, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #386 from myhloli/master
feat(draw_bbox): add model bbox drawing functionality
parents
e7b0f8be
c90ee891
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
86 additions
and
2 deletions
+86
-2
draw_bbox.py
magic_pdf/libs/draw_bbox.py
+66
-1
ocr_content_type.py
magic_pdf/libs/ocr_content_type.py
+14
-0
cli_dev.py
magic_pdf/tools/cli_dev.py
+2
-0
common.py
magic_pdf/tools/common.py
+4
-1
No files found.
magic_pdf/libs/draw_bbox.py
View file @
fa3475a4
from
magic_pdf.libs.Constants
import
CROSS_PAGE
from
magic_pdf.libs.Constants
import
CROSS_PAGE
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
,
CategoryId
from
magic_pdf.model.magic_model
import
MagicModel
def
draw_bbox_without_number
(
i
,
bbox_list
,
page
,
rgb_config
,
fill_config
):
def
draw_bbox_without_number
(
i
,
bbox_list
,
page
,
rgb_config
,
fill_config
):
...
@@ -225,3 +226,67 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -225,3 +226,67 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
# Save the PDF
# Save the PDF
pdf_docs
.
save
(
f
"{out_path}/spans.pdf"
)
pdf_docs
.
save
(
f
"{out_path}/spans.pdf"
)
def drow_model_bbox(model_list: list, pdf_bytes, out_path):
    """Draw the raw model layout detections onto the PDF and save the result.

    For every page index in ``model_list`` the layout detections returned by
    ``MagicModel.get_model_list`` are grouped by their ``"category_id"``;
    each group is then drawn on the matching page in its own colour and the
    annotated document is written to ``{out_path}/model.pdf``.

    The function name keeps its historical spelling because
    ``magic_pdf/tools/common.py`` imports it under this name.

    Args:
        model_list: Per-page model output; its length determines how many
            pages are annotated.
        pdf_bytes: Raw bytes of the PDF, opened through ``fitz``.
        out_path: Directory in which ``model.pdf`` is written.
    """
    # (category id, RGB colour) pairs, listed in the order they are drawn.
    # Detections whose category is not listed here are ignored.
    draw_plan = (
        (CategoryId.Abandon, (158, 158, 158)),  # color !
        (CategoryId.TableBody, (204, 204, 0)),
        (CategoryId.TableCaption, (255, 255, 102)),
        (CategoryId.TableFootnote, (229, 255, 204)),
        (CategoryId.ImageBody, (153, 255, 51)),
        (CategoryId.ImageCaption, (102, 178, 255)),
        (CategoryId.Title, (102, 102, 255)),
        (CategoryId.Text, (153, 0, 76)),
        (CategoryId.InterlineEquation_YOLO, (0, 255, 0)),
    )

    pdf_docs = fitz.open("pdf", pdf_bytes)
    magic_model = MagicModel(model_list, pdf_docs)

    # category id -> one list of bboxes per page, indexed by page number.
    bboxes_by_category = {category: [] for category, _ in draw_plan}
    annotated_pages = len(model_list)

    for page_no in range(annotated_pages):
        page_buckets = {category: [] for category in bboxes_by_category}
        page_info = magic_model.get_model_list(page_no)
        for layout_det in page_info["layout_dets"]:
            bbox = layout_det["bbox"]
            bucket = page_buckets.get(layout_det["category_id"])
            if bucket is not None:
                bucket.append(bbox)
        for category, page_bboxes in page_buckets.items():
            bboxes_by_category[category].append(page_bboxes)

    for i, page in enumerate(pdf_docs):
        # Only the first ``annotated_pages`` pages have collected bboxes.
        # NOTE(review): draw_bbox_with_number presumably indexes the list by
        # ``i``; stopping here avoids handing it a page index with no data.
        if i >= annotated_pages:
            break
        for category, rgb in draw_plan:
            # A fresh list per call, as the original passed list literals.
            # NOTE(review): ``True`` is presumably the fill flag - confirm
            # against draw_bbox_with_number's signature.
            draw_bbox_with_number(
                i, bboxes_by_category[category], page, list(rgb), True
            )

    # Save the PDF
    pdf_docs.save(f"{out_path}/model.pdf")
\ No newline at end of file
magic_pdf/libs/ocr_content_type.py
View file @
fa3475a4
...
@@ -19,3 +19,17 @@ class BlockType:
...
@@ -19,3 +19,17 @@ class BlockType:
Footnote
=
"footnote"
Footnote
=
"footnote"
Discarded
=
"discarded"
Discarded
=
"discarded"
class CategoryId:
    """Integer ids compared against the ``"category_id"`` of layout detections.

    Ids 9-12 are not defined in this class.
    """

    # Page-level text categories.
    Title = 0
    Text = 1
    Abandon = 2

    # Image categories.
    ImageBody = 3
    ImageCaption = 4

    # Table categories.
    TableBody = 5
    TableCaption = 6
    TableFootnote = 7

    # Equation categories and OCR text.
    InterlineEquation_Layout = 8
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    OcrText = 15
magic_pdf/tools/cli_dev.py
View file @
fa3475a4
...
@@ -94,6 +94,7 @@ def jsonl(jsonl, method, output_dir):
...
@@ -94,6 +94,7 @@ def jsonl(jsonl, method, output_dir):
jso
[
"doc_layout_result"
],
jso
[
"doc_layout_result"
],
method
,
method
,
f_dump_content_list
=
True
,
f_dump_content_list
=
True
,
f_draw_model_bbox
=
True
,
)
)
...
@@ -146,6 +147,7 @@ def pdf(pdf, json_data, output_dir, method):
...
@@ -146,6 +147,7 @@ def pdf(pdf, json_data, output_dir, method):
model_json_list
,
model_json_list
,
method
,
method
,
f_dump_content_list
=
True
,
f_dump_content_list
=
True
,
f_draw_model_bbox
=
True
,
)
)
...
...
magic_pdf/tools/common.py
View file @
fa3475a4
...
@@ -4,7 +4,7 @@ import copy
...
@@ -4,7 +4,7 @@ import copy
import
click
import
click
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.MakeContentConfig
import
DropMode
,
MakeMode
from
magic_pdf.libs.MakeContentConfig
import
DropMode
,
MakeMode
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
,
drow_model_bbox
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
...
@@ -37,6 +37,7 @@ def do_parse(
...
@@ -37,6 +37,7 @@ def do_parse(
f_dump_orig_pdf
=
True
,
f_dump_orig_pdf
=
True
,
f_dump_content_list
=
False
,
f_dump_content_list
=
False
,
f_make_md_mode
=
MakeMode
.
MM_MD
,
f_make_md_mode
=
MakeMode
.
MM_MD
,
f_draw_model_bbox
=
False
,
):
):
orig_model_list
=
copy
.
deepcopy
(
model_list
)
orig_model_list
=
copy
.
deepcopy
(
model_list
)
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
pdf_file_name
,
parse_method
)
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
pdf_file_name
,
parse_method
)
...
@@ -73,6 +74,8 @@ def do_parse(
...
@@ -73,6 +74,8 @@ def do_parse(
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
if
f_draw_span_bbox
:
if
f_draw_span_bbox
:
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
if
f_draw_model_bbox
:
drow_model_bbox
(
orig_model_list
,
pdf_bytes
,
local_md_dir
)
md_content
=
pipe
.
pipe_mk_markdown
(
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
,
drop_mode
=
DropMode
.
NONE
,
md_make_mode
=
f_make_md_mode
image_dir
,
drop_mode
=
DropMode
.
NONE
,
md_make_mode
=
f_make_md_mode
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment