Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
c5b27057
Commit
c5b27057
authored
Apr 16, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
切图逻辑修复
parent
d438b97a
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
19 additions
and
5 deletions
+19
-5
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+0
-5
cut_image.py
magic_pdf/pre_proc/cut_image.py
+19
-0
No files found.
magic_pdf/libs/pdf_image_tools.py
View file @
c5b27057
from
loguru
import
logger
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.libs.commons
import
fitz
...
@@ -20,10 +19,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
...
@@ -20,10 +19,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
# 新版本生成平铺路径
# 新版本生成平铺路径
img_hash256_path
=
f
"{compute_sha256(img_path)}.jpg"
img_hash256_path
=
f
"{compute_sha256(img_path)}.jpg"
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"image_bboxes: 错误的box, {bbox}"
)
return
img_hash256_path
# 将坐标转换为fitz.Rect对象
# 将坐标转换为fitz.Rect对象
rect
=
fitz
.
Rect
(
*
bbox
)
rect
=
fitz
.
Rect
(
*
bbox
)
# 配置缩放倍数为3倍
# 配置缩放倍数为3倍
...
...
magic_pdf/pre_proc/cut_image.py
View file @
c5b27057
from
loguru
import
logger
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
from
magic_pdf.libs.pdf_image_tools
import
cut_image
...
@@ -10,9 +12,13 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
...
@@ -10,9 +12,13 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
for
span
in
spans
:
for
span
in
spans
:
span_type
=
span
[
'type'
]
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Image
:
if
span_type
==
ContentType
.
Image
:
if
not
check_img_bbox
(
span
[
'bbox'
]):
continue
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'images'
),
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'images'
),
imageWriter
=
imageWriter
)
imageWriter
=
imageWriter
)
elif
span_type
==
ContentType
.
Table
:
elif
span_type
==
ContentType
.
Table
:
if
not
check_img_bbox
(
span
[
'bbox'
]):
continue
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'tables'
),
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'tables'
),
imageWriter
=
imageWriter
)
imageWriter
=
imageWriter
)
...
@@ -38,15 +44,28 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
...
@@ -38,15 +44,28 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
return
join_path
(
pdf_bytes_md5
,
type
)
return
join_path
(
pdf_bytes_md5
,
type
)
for
bbox
in
image_bboxes
:
for
bbox
in
image_bboxes
:
if
not
check_img_bbox
(
bbox
):
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
image_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
images_overlap_backup
:
for
bbox
in
images_overlap_backup
:
if
not
check_img_bbox
(
bbox
):
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_backup_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
image_backup_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
table_bboxes
:
for
bbox
in
table_bboxes
:
if
not
check_img_bbox
(
bbox
):
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"tables"
),
imageWriter
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"tables"
),
imageWriter
)
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
def check_img_bbox(bbox) -> bool:
    """Report whether ``bbox`` describes a box with positive extent.

    The box is read by index: element 0 is compared against element 2 and
    element 1 against element 3 (presumably ``(x0, y0, x1, y1)``, matching
    how callers hand the same value on to ``cut_image`` -- confirm against
    callers).

    Args:
        bbox: Indexable with at least four comparable entries.

    Returns:
        ``True`` when element 0 is strictly below element 2 and element 1 is
        strictly below element 3; otherwise a warning is logged and
        ``False`` is returned.
    """
    # Evaluate both comparisons before branching so that indexing errors
    # surface regardless of the outcome of the first comparison.
    degenerate_on_first_axis = bbox[0] >= bbox[2]
    degenerate_on_second_axis = bbox[1] >= bbox[3]

    if not (degenerate_on_first_axis or degenerate_on_second_axis):
        return True

    logger.warning(f"image_bboxes: 错误的box, {bbox}")
    return False
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment