Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
49076f02
Commit
49076f02
authored
Apr 23, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix draw_span_bbox logic
parent
3457256f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
44 additions
and
16 deletions
+44
-16
magicpdf.py
magic_pdf/cli/magicpdf.py
+2
-1
draw_bbox.py
magic_pdf/libs/draw_bbox.py
+42
-15
No files found.
magic_pdf/cli/magicpdf.py
View file @
49076f02
...
@@ -28,7 +28,7 @@ import click
...
@@ -28,7 +28,7 @@ import click
from
loguru
import
logger
from
loguru
import
logger
from
pathlib
import
Path
from
pathlib
import
Path
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
...
@@ -73,6 +73,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
...
@@ -73,6 +73,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
pipe
.
pipe_parse
()
pipe
.
pipe_parse
()
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
md_content
=
pipe
.
pipe_mk_markdown
()
md_content
=
pipe
.
pipe_mk_markdown
()
#part_file_name = datetime.now().strftime("%H-%M-%S")
#part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer
.
write
(
md_writer
.
write
(
...
...
magic_pdf/libs/draw_bbox.py
View file @
49076f02
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
def
draw_bbox_without_number
(
i
,
bbox_list
,
page
,
rgb_config
,
fill_config
):
def
draw_bbox_without_number
(
i
,
bbox_list
,
page
,
rgb_config
,
fill_config
):
...
@@ -58,32 +58,59 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -58,32 +58,59 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
# Save the PDF
# Save the PDF
pdf_docs
.
save
(
f
"{out_path}/layout.pdf"
)
pdf_docs
.
save
(
f
"{out_path}/layout.pdf"
)
def
draw_
text_bbox
(
pdf_info_dict
,
pdf_bytes
,
out_path
):
def
draw_
span_bbox
(
pdf_info
,
pdf_bytes
,
out_path
):
text_list
=
[]
text_list
=
[]
inline_equation_list
=
[]
inline_equation_list
=
[]
interline_equation_list
=
[]
interline_equation_list
=
[]
for
page
in
pdf_info_dict
.
values
():
image_list
=
[]
table_list
=
[]
for
page
in
pdf_info
:
page_text_list
=
[]
page_text_list
=
[]
page_inline_equation_list
=
[]
page_inline_equation_list
=
[]
page_interline_equation_list
=
[]
page_interline_equation_list
=
[]
page_image_list
=
[]
page_table_list
=
[]
for
block
in
page
[
'para_blocks'
]:
for
block
in
page
[
'para_blocks'
]:
for
line
in
block
[
'lines'
]:
if
block
[
'type'
]
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
InterlineEquation
]:
for
span
in
line
[
'spans'
]:
for
line
in
block
[
'lines'
]:
if
span
[
'type'
]
==
ContentType
.
Text
:
for
span
in
line
[
'spans'
]:
page_text_list
.
append
(
span
[
'bbox'
])
if
span
[
'type'
]
==
ContentType
.
Text
:
elif
span
[
'type'
]
==
ContentType
.
InlineEquation
:
page_text_list
.
append
(
span
[
'bbox'
])
page_inline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
InlineEquation
:
elif
span
[
'type'
]
==
ContentType
.
InterlineEquation
:
page_inline_equation_list
.
append
(
span
[
'bbox'
])
page_interline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
InterlineEquation
:
page_interline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
Image
:
page_image_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
Table
:
page_table_list
.
append
(
span
[
'bbox'
])
elif
block
[
'type'
]
in
[
BlockType
.
Image
,
BlockType
.
Table
]:
for
sub_block
in
block
[
"blocks"
]:
for
line
in
sub_block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
Text
:
page_text_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
InlineEquation
:
page_inline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
InterlineEquation
:
page_interline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
Image
:
page_image_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
ContentType
.
Table
:
page_table_list
.
append
(
span
[
'bbox'
])
text_list
.
append
(
page_text_list
)
text_list
.
append
(
page_text_list
)
inline_equation_list
.
append
(
page_inline_equation_list
)
inline_equation_list
.
append
(
page_inline_equation_list
)
interline_equation_list
.
append
(
page_interline_equation_list
)
interline_equation_list
.
append
(
page_interline_equation_list
)
image_list
.
append
(
page_image_list
)
table_list
.
append
(
page_table_list
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
for
i
,
page
in
enumerate
(
pdf_docs
):
for
i
,
page
in
enumerate
(
pdf_docs
):
# 获取当前页面的数据
# 获取当前页面的数据
draw_bbox_without_number
(
i
,
text_list
,
page
,
[
255
,
0
,
0
])
draw_bbox_without_number
(
i
,
text_list
,
page
,
[
255
,
0
,
0
],
False
)
draw_bbox_without_number
(
i
,
inline_equation_list
,
page
,
[
0
,
255
,
0
])
draw_bbox_without_number
(
i
,
inline_equation_list
,
page
,
[
0
,
255
,
0
],
False
)
draw_bbox_without_number
(
i
,
interline_equation_list
,
page
,
[
0
,
0
,
255
])
draw_bbox_without_number
(
i
,
interline_equation_list
,
page
,
[
0
,
0
,
255
],
False
)
draw_bbox_without_number
(
i
,
image_list
,
page
,
[
255
,
204
,
0
],
False
)
draw_bbox_without_number
(
i
,
table_list
,
page
,
[
204
,
0
,
255
],
False
)
# Save the PDF
# Save the PDF
pdf_docs
.
save
(
f
"{out_path}/
text
.pdf"
)
pdf_docs
.
save
(
f
"{out_path}/
spans
.pdf"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment