Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
8f65af9f
Commit
8f65af9f
authored
Apr 09, 2024
by
liukaiwen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
io modules
parent
cfac3b25
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
126 additions
and
20 deletions
+126
-20
AbsReaderWriter.py
magic_pdf/io/AbsReaderWriter.py
+10
-8
DiskReaderWriter.py
magic_pdf/io/DiskReaderWriter.py
+47
-0
S3ReaderWriter.py
magic_pdf/io/S3ReaderWriter.py
+69
-12
No files found.
magic_pdf/io/AbsReaderWriter.py
View file @
8f65af9f
from
abc
import
ABC
,
abstractmethod
class AbsReaderWriter(ABC):
    """Abstract base class for readers/writers that support both text and binary I/O.

    Concrete subclasses provide ``read`` and ``write`` for a specific storage
    backend.
    """

    def __init__(self):
        # Hook for shared initialization; nothing is required at the moment.
        pass

    @abstractmethod
    def read(self, path: str, mode="text"):
        """Read the content stored at ``path``.

        Args:
            path: Location of the content to read.
            mode: ``"text"`` (default) or ``"binary"``.
        """
        pass

    @abstractmethod
    def write(self, content: str, path: str, mode="text"):
        """Write ``content`` to ``path``.

        Args:
            content: Data to store. Annotated ``str``; presumably ``bytes``
                when ``mode`` is ``"binary"`` -- TODO confirm and widen the
                annotation.
            path: Location to write to.
            mode: ``"text"`` (default) or ``"binary"``.
        """
        pass
\ No newline at end of file
magic_pdf/io/DiskReaderWriter.py
View file @
8f65af9f
import
os
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
class DiskReaderWriter(AbsReaderWriter):
    """Read and write one local file, as text or as raw bytes.

    The target path is fixed at construction time, so ``read`` and ``write``
    take no path argument.

    NOTE(review): the abstract ``read``/``write`` declare a ``path``
    parameter that these overrides do not accept -- confirm which signature
    callers should rely on.
    """

    def __init__(self, parent_path, encoding='utf-8'):
        """Remember the target path and the encoding used in text mode.

        Args:
            parent_path: Path handed directly to ``open`` by ``read``/``write``.
            encoding: Text encoding used when ``mode`` is ``"text"``.
        """
        super().__init__()
        self.path = parent_path
        self.encoding = encoding

    def read(self, mode="text"):
        """Return the file content, or ``None`` when the file does not exist.

        Args:
            mode: ``"text"`` returns ``str`` decoded with ``self.encoding``;
                ``"binary"`` returns ``bytes``.

        Raises:
            ValueError: If ``mode`` is neither ``"text"`` nor ``"binary"``.
        """
        # Reject a bad mode up front so the error does not depend on whether
        # the file happens to exist.
        if mode not in ("text", "binary"):
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        try:
            if mode == "text":
                with open(self.path, 'r', encoding=self.encoding) as f:
                    return f.read()
            with open(self.path, 'rb') as f:
                return f.read()
        except FileNotFoundError:
            # Opening directly (instead of checking existence first) avoids a
            # race between the check and the open.
            print(f"文件 {self.path} 不存在")
            return None

    def write(self, data, mode="text"):
        """Overwrite the file with ``data``.

        Args:
            data: ``str`` for ``"text"`` mode, ``bytes`` for ``"binary"`` mode.
            mode: ``"text"`` (default) or ``"binary"``.

        Raises:
            ValueError: If ``mode`` is neither ``"text"`` nor ``"binary"``.
        """
        if mode == "text":
            open_kwargs = {"mode": 'w', "encoding": self.encoding}
        elif mode == "binary":
            open_kwargs = {"mode": 'wb'}
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        with open(self.path, **open_kwargs) as f:
            f.write(data)
        # Report success only after the file has been closed.
        print(f"内容已成功写入 {self.path}")
# Usage example
if __name__ == "__main__":
    file_path = "example.txt"
    disk_io = DiskReaderWriter(file_path)

    # Store raw bytes in the file.
    disk_io.write(b"Hello, World!", mode="binary")

    # Load the file back with the default mode and show it when non-empty.
    content = disk_io.read()
    if content:
        print(f"从 {file_path} 读取的内容: {content}")
magic_pdf/io/S3ReaderWriter.py
View file @
8f65af9f
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.commons
import
parse_aws_param
,
parse_bucket_key
import
boto3
from
loguru
import
logger
from
boto3.s3.transfer
import
TransferConfig
from
botocore.config
import
Config
class DiskReaderWriter(AbsReaderWriter):
    """Holder for a target path and a text encoding.

    NOTE(review): this class body defines only ``__init__``; no ``read`` or
    ``write`` override is visible here, so if the base class declares them
    abstract this class cannot be instantiated -- confirm whether this
    definition is still needed.
    """

    def __init__(self, parent_path, encoding='utf-8'):
        """Store ``parent_path`` as ``self.path`` and keep ``encoding``."""
        self.encoding = encoding
        self.path = parent_path
class S3ReaderWriter(AbsReaderWriter):
    """Read and write S3 objects, as text or as raw bytes, through a boto3 client."""

    def __init__(self, s3_profile):
        """Create the S3 client used by ``read`` and ``write``.

        Args:
            s3_profile: Connection settings handed to ``parse_aws_param``.
        """
        super().__init__()
        self.client = self._get_client(s3_profile)

    def _get_client(self, s3_profile):
        """Build a boto3 S3 client from ``s3_profile``.

        ``parse_aws_param`` supplies the access key, secret key, endpoint URL
        and addressing style. Retries use the ``standard`` mode with at most
        5 attempts.
        """
        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
        s3_client = boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=end_point,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )
        return s3_client

    def read(self, s3_path, mode="text", encoding="utf-8"):
        """Fetch the object at ``s3_path`` and return its content.

        Args:
            s3_path: Object location; split into bucket and key by
                ``parse_bucket_key``.
            mode: ``"text"`` returns ``str`` decoded with ``encoding``;
                ``"binary"`` returns ``bytes``.
            encoding: Encoding used to decode the body in text mode.

        Raises:
            ValueError: If ``mode`` is neither ``"text"`` nor ``"binary"``.
        """
        # Validate before touching the network so a bad mode costs no request.
        if mode not in ('text', 'binary'):
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, bucket_key = parse_bucket_key(s3_path)
        res = self.client.get_object(Bucket=bucket_name, Key=bucket_key)
        body = res["Body"].read()
        if mode == 'text':
            return body.decode(encoding)  # Decode bytes to text
        return body

    def write(self, data, s3_path, mode="text", encoding="utf-8"):
        """Upload ``data`` to ``s3_path``.

        Args:
            data: ``str`` in text mode (encoded with ``encoding``); passed to
                the client unchanged in binary mode.
            s3_path: Object location; split into bucket and key by
                ``parse_bucket_key``.
            mode: ``"text"`` (default) or ``"binary"``.
            encoding: Encoding used to encode ``data`` in text mode.

        Raises:
            ValueError: If ``mode`` is neither ``"text"`` nor ``"binary"``.
        """
        if mode == 'text':
            body = data.encode(encoding)  # Encode text data as bytes
        elif mode == 'binary':
            body = data
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, bucket_key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=bucket_key)
if __name__ == "__main__":
    # Config the connection info
    profile = {
        'ak': '',
        'sk': '',
        'endpoint': ''
    }
    # Create an S3ReaderWriter object
    s3_reader_writer = S3ReaderWriter(profile)

    # Write text data to S3
    text_data = "This is some text data"
    s3_reader_writer.write(data=text_data, s3_path="s3://bucket_name/ebook/test/test.json", mode='text')

    # Read text data from S3
    text_data_read = s3_reader_writer.read(s3_path="s3://bucket_name/ebook/test/test.json", mode='text')
    print(f"Read text data from S3: {text_data_read}")

    # Write binary data to S3
    binary_data = b"This is some binary data"
    # Upload the bytes payload; the text payload was previously sent here by mistake.
    s3_reader_writer.write(data=binary_data, s3_path="s3://bucket_name/ebook/test/test2.json", mode='binary')

    # Read binary data from S3
    binary_data_read = s3_reader_writer.read(s3_path="s3://bucket_name/ebook/test/test2.json", mode='binary')
    print(f"Read binary data from S3: {binary_data_read}")
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment