Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
51bb3b36
Commit
51bb3b36
authored
Apr 10, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
cut_image不报错公式图片,增加parse_union_pdf逻辑
parent
c45fdcc8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
27 deletions
+36
-27
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+9
-24
spark_api.py
magic_pdf/spark/spark_api.py
+27
-3
No files found.
magic_pdf/libs/pdf_image_tools.py
View file @
51bb3b36
...
...
@@ -5,7 +5,7 @@ from magic_pdf.libs.commons import join_path
from
magic_pdf.libs.hash_utils
import
compute_sha256
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
,
upload
=
True
):
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
...
...
@@ -19,17 +19,16 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
# 新版本生成平铺路径
img_hash256_path
=
f
"{compute_sha256(img_path)}.jpg"
if
upload
:
# 将坐标转换为fitz.Rect对象
rect
=
fitz
.
Rect
(
*
bbox
)
# 配置缩放倍数为3倍
zoom
=
fitz
.
Matrix
(
3
,
3
)
# 截取图片
pix
=
page
.
get_pixmap
(
clip
=
rect
,
matrix
=
zoom
)
# 将坐标转换为fitz.Rect对象
rect
=
fitz
.
Rect
(
*
bbox
)
# 配置缩放倍数为3倍
zoom
=
fitz
.
Matrix
(
3
,
3
)
# 截取图片
pix
=
page
.
get_pixmap
(
clip
=
rect
,
matrix
=
zoom
)
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
imageWriter
.
write
(
data
=
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
imageWriter
.
write
(
data
=
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
return
img_hash256_path
...
...
@@ -74,18 +73,4 @@ def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"tables"
),
imageWriter
)
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
equation_inline_bboxes
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"equation_inline_bboxes: 错误的box, {bbox}"
)
continue
image_path
=
cut_image
(
bbox
[:
4
],
page_num
,
page
,
return_path
(
"equations_inline"
),
imageWriter
,
upload
=
False
)
inline_eq_info
.
append
({
'bbox'
:
bbox
[:
4
],
"image_path"
:
image_path
,
"latex_text"
:
bbox
[
4
]})
for
bbox
in
equation_interline_bboxes
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"equation_interline_bboxes: 错误的box, {bbox}"
)
continue
image_path
=
cut_image
(
bbox
[:
4
],
page_num
,
page
,
return_path
(
"equation_interline"
),
imageWriter
,
upload
=
False
)
interline_eq_info
.
append
({
"bbox"
:
bbox
[:
4
],
"image_path"
:
image_path
,
"latex_text"
:
bbox
[
4
]})
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
\ No newline at end of file
magic_pdf/spark/spark_api.py
View file @
51bb3b36
...
...
@@ -12,7 +12,7 @@
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from
loguru
import
logger
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
...
...
@@ -31,7 +31,6 @@ def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter
debug_mode
=
is_debug
,
)
return
pdf_info_dict
pass
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
...
...
@@ -52,4 +51,29 @@ def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWri
"""
ocr和文本混合的pdf,全部解析出来
"""
pass
\ No newline at end of file
def parse_pdf(method):
    """Run one parsing backend and return its result, or None if it raised.

    Args:
        method: Callable invoked as ``method(pdf_bytes, pdf_models,
            imageWriter, start_page_id=start_page, debug_mode=is_debug)``.
            Those values are not parameters of this helper; they are read
            from the surrounding scope.

    Returns:
        Whatever ``method`` returns, or ``None`` when it raises an
        ``Exception``.
    """
    try:
        return method(
            pdf_bytes,
            pdf_models,
            imageWriter,
            start_page_id=start_page,
            debug_mode=is_debug,
        )
    except Exception as e:
        # Deliberately broad: any failure is turned into None so the caller
        # can decide what to do next instead of crashing here.
        # Not every callable has __name__ (e.g. functools.partial), so fall
        # back to repr() rather than raising a second error in the handler.
        method_name = getattr(method, "__name__", repr(method))
        # logger.exception logs at error level with the same message, but
        # also records the traceback that logger.error(f"...{e}") dropped.
        logger.exception(f"{method_name} error: {e}")
        return None
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_txt
)
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"need_drop"
,
False
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
if
pdf_info_dict
is
None
:
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
return
pdf_info_dict
def spark_json_extractor(jso: dict):
    """Placeholder with no implementation yet: accepts ``jso`` and returns None."""
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment