Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
4de8810c
Commit
4de8810c
authored
Apr 15, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
84a87899
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
6 additions
and
4 deletions
+6
-4
magicpdf.py
magic_pdf/cli/magicpdf.py
+1
-1
config_reader.py
magic_pdf/libs/config_reader.py
+1
-1
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+4
-2
No files found.
magic_pdf/cli/magicpdf.py
View file @
4de8810c
...
@@ -132,7 +132,7 @@ def pdf_command(pdf, model, method):
...
@@ -132,7 +132,7 @@ def pdf_command(pdf, model, method):
local_image_dir
,
_
=
prepare_env
()
local_image_dir
,
_
=
prepare_env
()
local_image_rw
=
DiskReaderWriter
(
local_image_dir
)
local_image_rw
=
DiskReaderWriter
(
local_image_dir
)
parse
=
get_pdf_parse_method
(
method
)
parse
=
get_pdf_parse_method
(
method
)
parse
(
pdf_data
,
jso
[
"doc_layout_result"
]
,
local_image_rw
,
is_debug
=
True
)
parse
(
pdf_data
,
jso
,
local_image_rw
,
is_debug
=
True
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
magic_pdf/libs/config_reader.py
View file @
4de8810c
...
@@ -17,7 +17,7 @@ def read_config():
...
@@ -17,7 +17,7 @@ def read_config():
config_file
=
os
.
path
.
join
(
home_dir
,
"magic-pdf.json"
)
config_file
=
os
.
path
.
join
(
home_dir
,
"magic-pdf.json"
)
if
not
os
.
path
.
exists
(
config_file
):
if
not
os
.
path
.
exists
(
config_file
):
raise
Exception
(
"magic-pdf.json
not found"
)
raise
Exception
(
f
"{config_file}
not found"
)
with
open
(
config_file
,
"r"
)
as
f
:
with
open
(
config_file
,
"r"
)
as
f
:
config
=
json
.
load
(
f
)
config
=
json
.
load
(
f
)
...
...
magic_pdf/libs/pdf_image_tools.py
View file @
4de8810c
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.libs.commons
import
fitz
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.hash_utils
import
compute_sha256
from
magic_pdf.libs.hash_utils
import
compute_sha256
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
):
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
:
AbsReaderWriter
):
"""
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
...
@@ -28,6 +29,7 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
...
@@ -28,6 +29,7 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
imageWriter
.
write
(
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
imageWriter
.
write
(
content
=
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
imageWriter
.
write
(
content
=
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
return
img_hash256_path
return
img_hash256_path
...
@@ -73,4 +75,4 @@ def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
...
@@ -73,4 +75,4 @@ def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"tables"
),
imageWriter
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"tables"
),
imageWriter
)
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment