Qin Kaijie / pdf-miner / Commits

Commit 779215e1, authored Apr 09, 2024 by quyuan

Merge branch 'master' of https://github.com/magicpdf/Magic-PDF

Parents: cdb318fe, 6532b24f

Showing 11 changed files with 160 additions and 246 deletions.
demo/demo_commons.py             +2   -2
demo/ocr_demo.py                 +33  -4
demo/text_demo.py                +63  -55
magic_pdf/libs/config_reader.py  +15  -3
magic_pdf/pipeline.bak           +1   -81
magic_pdf/pipeline_ocr.bak       +0   -0
magic_pdf/pipeline_txt.bak       +0   -0
magic_pdf/spark/base.py          +0   -11
magic_pdf/spark/s3.bak           +17  -0
magic_pdf/spark/s3.py            +0   -86
tests/test_commons.py            +29  -4
demo/demo_commons.py

 import json
-from magic_pdf.spark.s3 import get_s3_config
+from magic_pdf.libs.config_reader import get_s3_config_dict
 from magic_pdf.libs.commons import join_path, read_file, json_dump_path
...
@@ -16,7 +16,7 @@ def get_json_from_local_or_s3(book_name=None):
     # error_log_path & json_dump_path
     # the source json can be fetched from either of the two paths above
     json_path = join_path(json_dump_path, book_name + ".json")
-    s3_config = get_s3_config(json_path)
+    s3_config = get_s3_config_dict(json_path)
     file_content = read_file(json_path, s3_config)
     json_str = file_content.decode("utf-8")
     # logger.info(json_str)
...
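Note: the swap above only changes where the S3 credentials come from. A minimal sketch of the new call path for this helper, assuming a placeholder book name (get_s3_config_dict returns an {"ak", "sk", "endpoint"} dict, per the config_reader.py hunk further down):

# Sketch, not part of the diff; "some_book" is a placeholder.
import json
from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.libs.commons import join_path, read_file, json_dump_path

json_path = join_path(json_dump_path, "some_book" + ".json")
s3_config = get_s3_config_dict(json_path)   # {"ak": ..., "sk": ..., "endpoint": ...}
file_content = read_file(json_path, s3_config)
json_object = json.loads(file_content.decode("utf-8"))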
demo/ocr_demo.py

 import json
 import os
+import sys
+import time
 from loguru import logger
 from pathlib import Path
-from magic_pdf.pipeline_ocr import ocr_parse_pdf_core
-from magic_pdf.spark.s3 import get_s3_config
+from magic_pdf.libs.config_reader import get_s3_config_dict
+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 from demo.demo_commons import get_json_from_local_or_s3
 from magic_pdf.dict2md.ocr_mkcontent import (
     ocr_mk_mm_markdown_with_para,
     make_standard_format_with_para
 )
-from magic_pdf.libs.commons import join_path, read_file
+from magic_pdf.libs.commons import join_path, read_file, formatted_time


 def save_markdown(markdown_text, input_filepath):
...
@@ -50,7 +52,7 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
     json_object = get_json_from_local_or_s3(book_name)
     # logger.info(json_object)
     s3_pdf_path = json_object["file_location"]
-    s3_config = get_s3_config(s3_pdf_path)
+    s3_config = get_s3_config_dict(s3_pdf_path)
     pdf_bytes = read_file(s3_pdf_path, s3_config)
     ocr_pdf_model_info = json_object.get("doc_layout_result")
     ocr_parse_core(book_name, pdf_bytes, ocr_pdf_model_info)
...
@@ -83,6 +85,33 @@ def ocr_parse_core(book_name, pdf_bytes, ocr_pdf_model_info, start_page_id=0):
         f.write(json.dumps(standard_format, ensure_ascii=False))


+def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_id=0, debug_mode=False):
+    start_time = time.time()  # record the start time
+    # log book_name and the start time before parsing
+    logger.info(
+        f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
+        file=sys.stderr,
+    )
+    pdf_info_dict = parse_pdf_by_ocr(
+        pdf_bytes,
+        model_output_json_list,
+        "",
+        book_name,
+        pdf_model_profile=None,
+        start_page_id=start_page_id,
+        debug_mode=debug_mode,
+    )
+    end_time = time.time()  # record the finish time
+    parse_time = int(end_time - start_time)  # compute the elapsed time
+    # log book_name and the cost once parsing completes
+    logger.info(
+        f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
+        file=sys.stderr,
+    )
+    return pdf_info_dict, parse_time
+
+
 if __name__ == '__main__':
     pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
     json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
...
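The ocr_parse_pdf_core helper added above returns both the parsed dict and the elapsed seconds. A usage sketch under the same inputs that ocr_online_parse assembles (the book name is a placeholder):

# Sketch, not part of the diff; mirrors ocr_online_parse above.
json_object = get_json_from_local_or_s3("some_book")   # placeholder
s3_pdf_path = json_object["file_location"]
pdf_bytes = read_file(s3_pdf_path, get_s3_config_dict(s3_pdf_path))
model_info = json_object.get("doc_layout_result")
pdf_info_dict, parse_time = ocr_parse_pdf_core(
    pdf_bytes, model_info, "some_book", start_page_id=0, debug_mode=True
)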
demo/text_demo.py

...
@@ -6,84 +6,93 @@ from pathlib import Path
 import click

 from demo.demo_commons import get_json_from_local_or_s3, write_json_to_local, local_jsonl_path, local_json_path
-from magic_pdf.dict2md.mkcontent import mk_mm_markdown
-from magic_pdf.pipeline import (
-    meta_scan,
-    classify_by_type,
-    parse_pdf,
-    pdf_intermediate_dict_to_markdown,
-    save_tables_to_s3,
-)
-from magic_pdf.libs.commons import join_path
+from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
+from magic_pdf.filter.pdf_classify_by_type import classify
+from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
+from magic_pdf.libs.commons import join_path, read_file
 from loguru import logger
+from magic_pdf.libs.config_reader import get_s3_config_dict
+from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
+from magic_pdf.spark.base import get_data_source


 def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
     json_object = get_json_from_local_or_s3(book_name)
-    jso = parse_pdf(json_object, start_page_id=start_page_id, debug_mode=debug_mode)
-    logger.info(f"pdf_parse_time: {jso['parse_time']}")
-    write_json_to_local(jso, book_name)
-    jso_md = pdf_intermediate_dict_to_markdown(jso, debug_mode=debug_mode)
-    content = jso_md.get("content_list")
-    markdown_content = mk_mm_markdown(content)
+    s3_pdf_path = json_object.get("file_location")
+    s3_config = get_s3_config_dict(s3_pdf_path)
+    pdf_bytes = read_file(s3_pdf_path, s3_config)
+    model_output_json_list = json_object.get("doc_layout_result")
+    data_source = get_data_source(json_object)
+    file_id = json_object.get("file_id")
+    junk_img_bojids = json_object["pdf_meta"]["junk_img_bojids"]
+    save_path = ""
+    pdf_info_dict = parse_pdf_by_txt(
+        pdf_bytes,
+        model_output_json_list,
+        save_path,
+        f"{data_source}/{file_id}",
+        pdf_model_profile=None,
+        start_page_id=start_page_id,
+        junk_img_bojids=junk_img_bojids,
+        debug_mode=debug_mode,
+    )
+    write_json_to_local(pdf_info_dict, book_name)
+    content_list = mk_universal_format(pdf_info_dict)
+    markdown_content = mk_mm_markdown(content_list)
     if book_name is not None:
         save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest", "md", book_name)
         uni_format_save_path = join_path(save_tmp_path, "book" + ".json")
         markdown_save_path = join_path(save_tmp_path, "book" + ".md")
         with open(uni_format_save_path, "w", encoding="utf-8") as f:
-            f.write(json.dumps(content, ensure_ascii=False, indent=4))
+            f.write(json.dumps(content_list, ensure_ascii=False, indent=4))
         with open(markdown_save_path, "w", encoding="utf-8") as f:
             f.write(markdown_content)
     else:
-        logger.info(json.dumps(content, ensure_ascii=False))
+        logger.info(json.dumps(content_list, ensure_ascii=False))


-def demo_save_tables(book_name=None, start_page_id=0, debug_mode=True):
-    json_object = get_json_from_local_or_s3(book_name)
-    jso = parse_pdf(json_object, start_page_id=start_page_id, debug_mode=debug_mode)
-    logger.info(f"pdf_parse_time: {jso['parse_time']}")
-    write_json_to_local(jso, book_name)
-    save_tables_to_s3(jso, debug_mode=debug_mode)
-
-
 def demo_classify_by_type(book_name=None, debug_mode=True):
     json_object = get_json_from_local_or_s3(book_name)
-    jso = classify_by_type(json_object, debug_mode=debug_mode)
-    logger.info(json.dumps(jso, ensure_ascii=False))
-    logger.info(f"classify_time: {jso['classify_time']}")
-    write_json_to_local(jso, book_name)
+    pdf_meta = json_object.get("pdf_meta")
+    total_page = pdf_meta["total_page"]
+    page_width = pdf_meta["page_width_pts"]
+    page_height = pdf_meta["page_height_pts"]
+    img_sz_list = pdf_meta["image_info_per_page"]
+    img_num_list = pdf_meta["imgs_per_page"]
+    text_len_list = pdf_meta["text_len_per_page"]
+    text_layout_list = pdf_meta["text_layout_per_page"]
+    pdf_path = json_object.get("file_location")
+    is_text_pdf, results = classify(
+        pdf_path,
+        total_page,
+        page_width,
+        page_height,
+        img_sz_list,
+        text_len_list,
+        img_num_list,
+        text_layout_list,
+    )
+    logger.info(f"is_text_pdf: {is_text_pdf}")
+    logger.info(json.dumps(results, ensure_ascii=False))
+    write_json_to_local(results, book_name)


 def demo_meta_scan(book_name=None, debug_mode=True):
     json_object = get_json_from_local_or_s3(book_name)
-    jso = meta_scan(json_object, doc_layout_check=True)
-    logger.info(json.dumps(jso, ensure_ascii=False))
-    logger.info(f"meta_scan_time: {jso['meta_scan_time']}")
-    write_json_to_local(jso, book_name)
-
-
-def demo_meta_scan_from_jsonl():
-    with open(local_jsonl_path, "r", encoding="utf-8") as jsonl_file:
-        for line in jsonl_file:
-            jso = json.loads(line)
-            jso = meta_scan(jso)
-            logger.info(f"pdf_path: {jso['content']['pdf_path']}")
-            logger.info(f"read_file_time: {jso['read_file_time']}")
-            logger.info(f"meta_scan_time: {jso['meta_scan_time']}")
+    # doc_layout_check=False
+    s3_pdf_path = json_object.get("file_location")
+    s3_config = get_s3_config_dict(s3_pdf_path)
+    pdf_bytes = read_file(s3_pdf_path, s3_config)
+    res = pdf_meta_scan(s3_pdf_path, pdf_bytes)
+    logger.info(json.dumps(res, ensure_ascii=False))
+    write_json_to_local(res, book_name)


 def demo_test5():
...
@@ -94,7 +103,6 @@ def demo_test5():
     logger.info(f"img_list_len: {img_list_len}")


 def read_more_para_test_samples(type="scihub"):
     # read the multi-paragraph test samples
     curr_dir = Path(__file__).parent
...
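After this rewrite the text demo no longer routes through magic_pdf.pipeline; the core chain of demo_parse_pdf reduces to roughly the following (a sketch reusing the names bound in the hunk above):

# Sketch, not part of the diff.
pdf_info_dict = parse_pdf_by_txt(
    pdf_bytes, model_output_json_list, "", f"{data_source}/{file_id}",
    pdf_model_profile=None, start_page_id=0,
    junk_img_bojids=junk_img_bojids, debug_mode=True,
)
content_list = mk_universal_format(pdf_info_dict)   # intermediate dict -> universal content list
markdown_content = mk_mm_markdown(content_list)     # content list -> multimodal markdown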
magic_pdf/libs/config_reader.py

...
@@ -7,6 +7,8 @@ import os
 from loguru import logger

+from magic_pdf.libs.commons import parse_bucket_key
+

 def get_s3_config(bucket_name: str):
     """
...
@@ -25,8 +27,8 @@ def get_s3_config(bucket_name: str):
     bucket_info = config.get("bucket_info")
     if bucket_name not in bucket_info:
-        raise Exception("bucket_name not found in magic-pdf.json")
-    access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
+        access_key, secret_key, storage_endpoint = bucket_info["[default]"]
+    else:
+        access_key, secret_key, storage_endpoint = bucket_info[bucket_name]

     if access_key is None or secret_key is None or storage_endpoint is None:
...
@@ -37,5 +39,15 @@ def get_s3_config(bucket_name: str):
     return access_key, secret_key, storage_endpoint


+def get_s3_config_dict(path: str):
+    access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
+    return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint}
+
+
+def get_bucket_name(path):
+    bucket, key = parse_bucket_key(path)
+    return bucket
+
+
 if __name__ == '__main__':
     ak, sk, endpoint = get_s3_config("llm-raw")
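A usage sketch of the two new helpers (the s3 path is a placeholder; credentials come from magic-pdf.json, unknown buckets now fall back to the "[default]" entry per the @@ -25,8 +27,8 @@ hunk, and parse_bucket_key is assumed to split the path into bucket and key as its name suggests):

# Sketch, not part of the diff; the path is a placeholder.
conf = get_s3_config_dict("s3://llm-raw/some/key.json")
assert set(conf) == {"ak", "sk", "endpoint"}
assert get_bucket_name("s3://llm-raw/some/key.json") == "llm-raw"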
magic_pdf/pipeline.py → magic_pdf/pipeline.bak

...
@@ -21,7 +21,7 @@ from loguru import logger
 from magic_pdf.pdf_parse_for_train import parse_pdf_for_train
 from magic_pdf.spark.base import exception_handler, get_data_source
 from magic_pdf.train_utils.convert_to_train_format import convert_to_train_format
-from magic_pdf.spark.s3 import get_s3_config, get_s3_client
+from magic_pdf.spark.s3 import get_s3_config
...
@@ -161,86 +161,6 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
     return jso


-def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
-    if debug_mode:
-        pass
-    else:
-        # if debug mode is off, check whether the need_drop flag is set
-        if jso.get("need_drop", False):
-            logger.info(
-                f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop",
-                file=sys.stderr,
-            )
-            jso["dropped"] = True
-            return jso
-    try:
-        data_source = get_data_source(jso)
-        file_id = jso.get("file_id")
-        book_name = f"{data_source}/{file_id}"
-        title = jso.get("title")
-        url_encode_title = quote(title, safe="")
-        if data_source != "scihub":
-            return jso
-        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
-        # decompress pdf_intermediate_dict
-        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
-        i = 0
-        for page in pdf_intermediate_dict.values():
-            if page.get("tables"):
-                if len(page["tables"]) > 0:
-                    j = 0
-                    for table in page["tables"]:
-                        if debug_mode:
-                            image_path = join_path(
-                                "s3://mllm-raw-media/pdf2md_img/",
-                                book_name,
-                                table["image_path"],
-                            )
-                        else:
-                            image_path = join_path(
-                                "s3://mllm-raw-media/pdf2md_img/", table["image_path"]
-                            )
-                        if image_path.endswith(".jpg"):
-                            j += 1
-                            s3_client = get_s3_client(image_path)
-                            bucket_name, bucket_key = parse_bucket_key(image_path)
-                            # fetch the image into memory through s3_client
-                            image_bytes = s3_client.get_object(
-                                Bucket=bucket_name, Key=bucket_key
-                            )["Body"].read()
-                            # save the image to its new location
-                            if debug_mode:
-                                new_image_path = join_path(
-                                    "s3://mllm-raw-media/pdf2md_img/table_new/",
-                                    url_encode_title + "_" + table["image_path"].lstrip("tables/"),
-                                )
-                            else:
-                                new_image_path = join_path(
-                                    "s3://mllm-raw-media/pdf2md_img/table_new/",
-                                    url_encode_title + f"_page{i}_{j}.jpg",
-                                )
-                            logger.info(new_image_path, file=sys.stderr)
-                            bucket_name, bucket_key = parse_bucket_key(new_image_path)
-                            s3_client.put_object(Bucket=bucket_name, Key=bucket_key, Body=image_bytes)
-                        else:
-                            continue
-            i += 1
-        # clear out fields that are no longer needed
-        jso["doc_layout_result"] = ""
-        jso["pdf_intermediate_dict"] = ""
-        jso["pdf_meta"] = ""
-    except Exception as e:
-        jso = exception_handler(jso, e)
-    return jso
-
-
 def drop_needdrop_pdf(jso: dict) -> dict:
     if jso.get("need_drop", False):
         logger.info(
...
magic_pdf/pipeline_ocr.py → magic_pdf/pipeline_ocr.bak
File moved

magic_pdf/pipeline_txt.py → magic_pdf/pipeline_txt.bak
File moved
magic_pdf/spark/base.py

 from loguru import logger

-from magic_pdf.libs.commons import read_file
 from magic_pdf.libs.drop_reason import DropReason
-from magic_pdf.spark.s3 import get_s3_config


 def get_data_source(jso: dict):
     data_source = jso.get("data_source")
...
@@ -41,10 +37,3 @@ def get_bookname(jso: dict):
     file_id = jso.get("file_id")
     book_name = f"{data_source}/{file_id}"
     return book_name
-
-
-def get_pdf_bytes(jso: dict):
-    pdf_s3_path = jso.get("file_location")
-    s3_config = get_s3_config(pdf_s3_path)
-    pdf_bytes = read_file(pdf_s3_path, s3_config)
-    return pdf_bytes
\ No newline at end of file
magic_pdf/spark/s3.bak (new file, mode 100644)

import re

from magic_pdf.libs.config_reader import get_s3_config_dict

__re_s3_path = re.compile("^s3a?://([^/]+)(?:/(.*))?$")


def get_s3_config(path):
    bucket_name = split_s3_path(path)[0] if path else ""
    return get_s3_config_dict(bucket_name)


def split_s3_path(path: str):
    "split bucket and key from path"
    m = __re_s3_path.match(path)
    if m is None:
        return "", ""
    return m.group(1), (m.group(2) or "")
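This shim keeps the old get_s3_config entry point but delegates to config_reader. The path regex behaves as follows (the expected values follow directly from the pattern above):

# Sketch, not part of the diff.
assert split_s3_path("s3://bucket/a/b.pdf") == ("bucket", "a/b.pdf")
assert split_s3_path("s3a://bucket") == ("bucket", "")
assert split_s3_path("not-a-path") == ("", "")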
magic_pdf/spark/s3.py (deleted, mode 100644 → 0)

# from app.common import s3
import boto3
from botocore.client import Config
import re
import random
from typing import List, Union

try:
    from app.config import s3_buckets, s3_clusters, s3_users

    # TODO delete: circular dependency
    from app.common.runtime import get_cluster_name
except ImportError:
    from magic_pdf.config import s3_buckets, s3_clusters, get_cluster_name, s3_users

__re_s3_path = re.compile("^s3a?://([^/]+)(?:/(.*))?$")


def get_s3_config(path: Union[str, List[str]], outside=False):
    paths = [path] if type(path) == str else path
    bucket_config = None
    for p in paths:
        bc = __get_s3_bucket_config(p)
        if bucket_config in [bc, None]:
            bucket_config = bc
            continue
        raise Exception(f"{paths} have different s3 config, cannot read together.")
    if not bucket_config:
        raise Exception("path is empty.")
    return __get_s3_config(bucket_config, outside, prefer_ip=True)


def __get_s3_config(
    bucket_config: tuple,
    outside: bool,
    prefer_ip=False,
    prefer_auto=False,
):
    cluster, user = bucket_config
    cluster_config = s3_clusters[cluster]
    if outside:
        endpoint_key = "outside"
    elif prefer_auto and "auto" in cluster_config:
        endpoint_key = "auto"
    elif cluster_config.get("cluster") == get_cluster_name():
        endpoint_key = "inside"
    else:
        endpoint_key = "outside"
    if prefer_ip and f"{endpoint_key}_ips" in cluster_config:
        endpoint_key = f"{endpoint_key}_ips"
    endpoints = cluster_config[endpoint_key]
    endpoint = random.choice(endpoints)
    return {"endpoint": endpoint, **s3_users[user]}


def split_s3_path(path: str):
    "split bucket and key from path"
    m = __re_s3_path.match(path)
    if m is None:
        return "", ""
    return m.group(1), (m.group(2) or "")


def __get_s3_bucket_config(path: str):
    bucket = split_s3_path(path)[0] if path else ""
    bucket_config = s3_buckets.get(bucket)
    if not bucket_config:
        bucket_config = s3_buckets.get("[default]")
    assert bucket_config is not None
    return bucket_config


def get_s3_client(path: Union[str, List[str]], outside=False):
    s3_config = get_s3_config(path, outside)
    try:
        return boto3.client(
            "s3",
            aws_access_key_id=s3_config["ak"],
            aws_secret_access_key=s3_config["sk"],
            endpoint_url=s3_config["endpoint"],
            config=Config(
                s3={"addressing_style": "path"},
                retries={"max_attempts": 8, "mode": "standard"},
            ),
        )
    except:
        # older boto3 does not support the retries.mode param.
        return boto3.client(
            "s3",
            aws_access_key_id=s3_config["ak"],
            aws_secret_access_key=s3_config["sk"],
            endpoint_url=s3_config["endpoint"],
            config=Config(
                s3={"addressing_style": "path"},
                retries={"max_attempts": 8},
            ),
        )
\ No newline at end of file
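For reference, the endpoint selection being deleted here reduced to this decision order (a summary of __get_s3_config above, not new behavior):

# outside=True                       -> "outside" endpoints
# prefer_auto and "auto" configured  -> "auto" endpoints
# same cluster as the caller         -> "inside" endpoints, otherwise "outside"
# prefer_ip swaps in "<key>_ips" when present; one endpoint is then chosen at
# random and merged with the per-user {"ak": ..., "sk": ...} entry.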
tests/test_commons.py

 import io
 import json
 import os
+import boto3
+from botocore.config import Config
 from magic_pdf.libs.commons import fitz
-from magic_pdf.spark.s3 import get_s3_config, get_s3_client
+from magic_pdf.libs.config_reader import get_s3_config_dict
 from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key
 from loguru import logger
...
@@ -12,7 +16,7 @@ test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"
 def get_test_pdf_json(book_name):
     json_path = join_path(json_dump_path, book_name + ".json")
-    s3_config = get_s3_config(json_path)
+    s3_config = get_s3_config_dict(json_path)
     file_content = read_file(json_path, s3_config)
     json_str = file_content.decode('utf-8')
     json_object = json.loads(json_str)
...
@@ -21,7 +25,7 @@ def get_test_pdf_json(book_name):
 def read_test_file(book_name):
     test_pdf_path = join_path(test_pdf_dir_path, book_name + ".pdf")
-    s3_config = get_s3_config(test_pdf_path)
+    s3_config = get_s3_config_dict(test_pdf_path)
     try:
         file_content = read_file(test_pdf_path, s3_config)
         return file_content
...
@@ -31,7 +35,7 @@ def read_test_file(book_name):
     try:
         json_object = get_test_pdf_json(book_name)
         orig_s3_pdf_path = json_object.get('file_location')
-        s3_config = get_s3_config(orig_s3_pdf_path)
+        s3_config = get_s3_config_dict(orig_s3_pdf_path)
         file_content = read_file(orig_s3_pdf_path, s3_config)
         s3_client = get_s3_client(test_pdf_path)
         bucket_name, bucket_key = parse_bucket_key(test_pdf_path)
...
@@ -53,3 +57,24 @@ def get_test_json_data(directory_path, json_file_name):
     with open(os.path.join(directory_path, json_file_name), "r", encoding='utf-8') as f:
         test_data = json.load(f)
     return test_data
+
+
+def get_s3_client(path):
+    s3_config = get_s3_config_dict(path)
+    try:
+        return boto3.client(
+            "s3",
+            aws_access_key_id=s3_config["ak"],
+            aws_secret_access_key=s3_config["sk"],
+            endpoint_url=s3_config["endpoint"],
+            config=Config(
+                s3={"addressing_style": "path"},
+                retries={"max_attempts": 8, "mode": "standard"},
+            ),
+        )
+    except:
+        # older boto3 does not support the retries.mode param.
+        return boto3.client(
+            "s3",
+            aws_access_key_id=s3_config["ak"],
+            aws_secret_access_key=s3_config["sk"],
+            endpoint_url=s3_config["endpoint"],
+            config=Config(
+                s3={"addressing_style": "path"},
+                retries={"max_attempts": 8},
+            ),
+        )
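The get_s3_client copy added here replaces the one deleted from magic_pdf/spark/s3.py, now fed by get_s3_config_dict. A usage sketch (the object key is a placeholder under the test_pdf_dir_path prefix defined in this file):

# Sketch, not part of the diff; the key is a placeholder.
path = "s3://llm-pdf-text/unittest/pdf/some_book.pdf"
client = get_s3_client(path)
bucket_name, bucket_key = parse_bucket_key(path)
body = client.get_object(Bucket=bucket_name, Key=bucket_key)["Body"].read()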