Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
60208b1b
Commit
60208b1b
authored
Apr 23, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix draw_layout_bbox logic
parent
d82d5d30
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
30 additions
and
14 deletions
+30
-14
magicpdf.py
magic_pdf/cli/magicpdf.py
+6
-1
draw_bbox.py
magic_pdf/libs/draw_bbox.py
+24
-13
No files found.
magic_pdf/cli/magicpdf.py
View file @
60208b1b
...
...
@@ -27,6 +27,7 @@ import click
from
loguru
import
logger
from
pathlib
import
Path
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
...
...
@@ -56,7 +57,7 @@ def prepare_env(pdf_file_name, method):
return
local_image_dir
,
local_md_dir
def
_do_parse
(
pdf_file_name
,
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
def
_do_parse
(
pdf_file_name
,
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
,
local_md_dir
):
if
parse_method
==
"auto"
:
pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
elif
parse_method
==
"txt"
:
...
...
@@ -69,6 +70,8 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
pipe
.
pipe_classify
()
pipe
.
pipe_parse
()
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
md_content
=
pipe
.
pipe_mk_markdown
()
#part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer
.
write
(
...
...
@@ -144,6 +147,7 @@ def json_command(json, method):
local_image_rw
,
local_md_rw
,
os
.
path
.
basename
(
local_image_dir
),
local_md_dir
)
...
...
@@ -185,6 +189,7 @@ def pdf_command(pdf, model, method):
local_image_rw
,
local_md_rw
,
os
.
path
.
basename
(
local_image_dir
),
local_md_dir
)
...
...
magic_pdf/libs/draw_bbox.py
View file @
60208b1b
...
...
@@ -2,7 +2,7 @@ from magic_pdf.libs.commons import fitz # PyMuPDF
from
magic_pdf.libs.ocr_content_type
import
ContentType
def
draw_bbox_without_number
(
i
,
bbox_list
,
page
,
rgb_config
):
def
draw_bbox_without_number
(
i
,
bbox_list
,
page
,
rgb_config
,
fill_config
):
new_rgb
=
[]
for
item
in
rgb_config
:
item
=
float
(
item
)
/
255
...
...
@@ -11,10 +11,13 @@ def draw_bbox_without_number(i, bbox_list, page, rgb_config):
for
bbox
in
page_data
:
x0
,
y0
,
x1
,
y1
=
bbox
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
if
fill_config
:
page
.
draw_rect
(
rect_coords
,
color
=
None
,
fill
=
new_rgb
,
fill_opacity
=
0.3
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
else
:
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
fill_opacity
=
1
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
def
draw_bbox_with_number
(
i
,
bbox_list
,
page
,
rgb_config
):
def
draw_bbox_with_number
(
i
,
bbox_list
,
page
,
rgb_config
,
fill_config
):
new_rgb
=
[]
for
item
in
rgb_config
:
item
=
float
(
item
)
/
255
...
...
@@ -23,27 +26,35 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
for
j
,
bbox
in
enumerate
(
page_data
):
x0
,
y0
,
x1
,
y1
=
bbox
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
page
.
insert_text
((
x0
,
y0
),
str
(
j
+
1
),
fontsize
=
10
,
color
=
new_rgb
)
# Insert the index at the top left corner of the rectangle
if
fill_config
:
page
.
draw_rect
(
rect_coords
,
color
=
None
,
fill
=
new_rgb
,
fill_opacity
=
0.3
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
else
:
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
fill_opacity
=
1
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
page
.
insert_text
((
x0
,
y0
+
10
),
str
(
j
+
1
),
fontsize
=
10
,
color
=
new_rgb
)
# Insert the index at the top left corner of the rectangle
def
draw_layout_bbox
(
pdf_info
_dict
,
pdf_bytes
,
out_path
):
def
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
out_path
):
layout_bbox_list
=
[]
blocks_bbox_list
=
[]
dropped_bbox_list
=
[]
for
page
in
pdf_info
_dict
.
values
()
:
for
page
in
pdf_info
:
page_layout_list
=
[]
page_dropped_list
=
[]
page_blocks_bbox_list
=
[]
for
layout
in
page
[
'layout_bboxes'
]:
page_layout_list
.
append
(
layout
[
'layout_bbox'
])
layout_bbox_list
.
append
(
page_layout_list
)
for
drop_tag
,
dropped_bboxes
in
page
[
'droped_bboxes'
]
.
items
():
for
dropped_bbox
in
dropped_bboxes
:
page_dropped_list
.
append
(
dropped_bbox
)
for
dropped_bbox
in
page
[
'discarded_blocks'
]:
page_dropped_list
.
append
(
dropped_bbox
[
'bbox'
])
dropped_bbox_list
.
append
(
page_dropped_list
)
for
block
in
page
[
'para_blocks'
]:
page_blocks_bbox_list
.
append
(
block
[
'bbox'
])
blocks_bbox_list
.
append
(
page_blocks_bbox_list
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
for
i
,
page
in
enumerate
(
pdf_docs
):
draw_bbox_with_number
(
i
,
layout_bbox_list
,
page
,
[
255
,
0
,
0
])
draw_bbox_without_number
(
i
,
dropped_bbox_list
,
page
,
[
0
,
255
,
0
])
draw_bbox_with_number
(
i
,
layout_bbox_list
,
page
,
[
255
,
0
,
0
],
False
)
draw_bbox_without_number
(
i
,
dropped_bbox_list
,
page
,
[
0
,
255
,
0
],
True
)
draw_bbox_without_number
(
i
,
blocks_bbox_list
,
page
,
[
0
,
0
,
255
],
True
)
# Save the PDF
pdf_docs
.
save
(
f
"{out_path}/layout.pdf"
)
...
...
@@ -55,7 +66,7 @@ def draw_text_bbox(pdf_info_dict, pdf_bytes, out_path):
page_text_list
=
[]
page_inline_equation_list
=
[]
page_interline_equation_list
=
[]
for
block
in
page
[
'p
reproc
_blocks'
]:
for
block
in
page
[
'p
ara
_blocks'
]:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
Text
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment