Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
338c6814
Commit
338c6814
authored
Nov 01, 2024
by
icecraft
Committed by
xu rui
Nov 01, 2024
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat: add more unittest
parent
47db844c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
138 additions
and
7 deletions
+138
-7
s3.py
magic_pdf/data/data_reader_writer/s3.py
+2
-2
test_multi_bucket_s3.py
tests/test_data/data_reader_writer/test_multi_bucket_s3.py
+80
-2
test_s3.py
tests/test_data/data_reader_writer/test_s3.py
+56
-3
No files found.
magic_pdf/data/data_reader_writer/s3.py
View file @
338c6814
...
...
@@ -25,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
"""
super
()
.
__init__
(
f
"{bucket}/{default_prefix_without_bucket}"
f
'{bucket}/{default_prefix_without_bucket}'
,
[
S3Config
(
bucket_name
=
bucket
,
...
...
@@ -60,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
"""
super
()
.
__init__
(
f
"{bucket}/{default_prefix_without_bucket}"
f
'{bucket}/{default_prefix_without_bucket}'
,
[
S3Config
(
bucket_name
=
bucket
,
...
...
tests/test_data/data_reader_writer/test_multi_bucket_s3.py
View file @
338c6814
...
...
@@ -41,8 +41,8 @@ def test_multi_bucket_s3_reader_writer():
),
]
reader
=
MultiBucketS3DataReader
(
default_bucket
=
bucket
,
s3_configs
=
s3configs
)
writer
=
MultiBucketS3DataWriter
(
default_bucket
=
bucket
,
s3_configs
=
s3configs
)
reader
=
MultiBucketS3DataReader
(
bucket
,
s3configs
)
writer
=
MultiBucketS3DataWriter
(
bucket
,
s3configs
)
bits
=
reader
.
read
(
'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
...
...
@@ -80,3 +80,81 @@ def test_multi_bucket_s3_reader_writer():
assert
'123'
.
encode
()
==
reader
.
read
(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
@
pytest
.
mark
.
skipif
(
os
.
getenv
(
'S3_ACCESS_KEY_2'
,
None
)
is
None
,
reason
=
'need s3 config!'
)
def
test_multi_bucket_s3_reader_writer_with_prefix
():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
"""
bucket
=
os
.
getenv
(
'S3_BUCKET'
,
''
)
ak
=
os
.
getenv
(
'S3_ACCESS_KEY'
,
''
)
sk
=
os
.
getenv
(
'S3_SECRET_KEY'
,
''
)
endpoint_url
=
os
.
getenv
(
'S3_ENDPOINT'
,
''
)
bucket_2
=
os
.
getenv
(
'S3_BUCKET_2'
,
''
)
ak_2
=
os
.
getenv
(
'S3_ACCESS_KEY_2'
,
''
)
sk_2
=
os
.
getenv
(
'S3_SECRET_KEY_2'
,
''
)
endpoint_url_2
=
os
.
getenv
(
'S3_ENDPOINT_2'
,
''
)
s3configs
=
[
S3Config
(
bucket_name
=
bucket
,
access_key
=
ak
,
secret_key
=
sk
,
endpoint_url
=
endpoint_url
),
S3Config
(
bucket_name
=
bucket_2
,
access_key
=
ak_2
,
secret_key
=
sk_2
,
endpoint_url
=
endpoint_url_2
,
),
]
prefix
=
'meta-index'
reader
=
MultiBucketS3DataReader
(
f
'{bucket}/{prefix}'
,
s3configs
)
writer
=
MultiBucketS3DataWriter
(
f
'{bucket}/{prefix}'
,
s3configs
)
bits
=
reader
.
read
(
'scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
assert
bits
==
reader
.
read
(
f
's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
bits
=
reader
.
read
(
f
's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
)
docs
=
fitz
.
open
(
'pdf'
,
bits
)
assert
len
(
docs
)
==
10
bits
=
reader
.
read
(
'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
)
assert
bits
==
reader
.
read_at
(
'scihub/v001/scihub/part-66210c190659-000026.jsonl'
,
566
,
713
)
assert
len
(
json
.
loads
(
bits
))
>
0
writer
.
write_string
(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
,
'abc'
)
assert
'abc'
.
encode
()
==
reader
.
read
(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
assert
'abc'
.
encode
()
==
reader
.
read
(
f
's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
writer
.
write
(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
,
'123'
.
encode
(),
)
assert
'123'
.
encode
()
==
reader
.
read
(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
tests/test_data/data_reader_writer/test_s3.py
View file @
338c6814
...
...
@@ -9,7 +9,7 @@ from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
@
pytest
.
mark
.
skipif
(
os
.
getenv
(
'S3_ACCESS_KEY'
,
None
)
is
None
,
reason
=
'need s3 config!'
)
def
test_
multi_bucket_
s3_reader_writer
():
def
test_s3_reader_writer
():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""
...
...
@@ -18,8 +18,8 @@ def test_multi_bucket_s3_reader_writer():
sk
=
os
.
getenv
(
'S3_SECRET_KEY'
,
''
)
endpoint_url
=
os
.
getenv
(
'S3_ENDPOINT'
,
''
)
reader
=
S3DataReader
(
bucket
=
bucket
,
ak
=
ak
,
sk
=
sk
,
endpoint_url
=
endpoint_url
)
writer
=
S3DataWriter
(
bucket
=
bucket
,
ak
=
ak
,
sk
=
sk
,
endpoint_url
=
endpoint_url
)
reader
=
S3DataReader
(
''
,
bucket
,
ak
,
sk
,
endpoint_url
)
writer
=
S3DataWriter
(
''
,
bucket
,
ak
,
sk
,
endpoint_url
)
bits
=
reader
.
read
(
'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
...
...
@@ -51,3 +51,56 @@ def test_multi_bucket_s3_reader_writer():
assert
'123'
.
encode
()
==
reader
.
read
(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
@
pytest
.
mark
.
skipif
(
os
.
getenv
(
'S3_ACCESS_KEY'
,
None
)
is
None
,
reason
=
'need s3 config!'
)
def
test_s3_reader_writer_with_prefix
():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""
bucket
=
os
.
getenv
(
'S3_BUCKET'
,
''
)
ak
=
os
.
getenv
(
'S3_ACCESS_KEY'
,
''
)
sk
=
os
.
getenv
(
'S3_SECRET_KEY'
,
''
)
endpoint_url
=
os
.
getenv
(
'S3_ENDPOINT'
,
''
)
prefix
=
'meta-index'
reader
=
S3DataReader
(
prefix
,
bucket
,
ak
,
sk
,
endpoint_url
)
writer
=
S3DataWriter
(
prefix
,
bucket
,
ak
,
sk
,
endpoint_url
)
bits
=
reader
.
read
(
'scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
assert
bits
==
reader
.
read
(
f
's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
bits
=
reader
.
read
(
'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
)
assert
bits
==
reader
.
read_at
(
'scihub/v001/scihub/part-66210c190659-000026.jsonl'
,
566
,
713
)
assert
len
(
json
.
loads
(
bits
))
>
0
writer
.
write_string
(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
,
'abc'
)
assert
'abc'
.
encode
()
==
reader
.
read
(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
assert
'abc'
.
encode
()
==
reader
.
read
(
f
's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
writer
.
write
(
f
'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
,
'123'
.
encode
(),
)
assert
'123'
.
encode
()
==
reader
.
read
(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment