Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
90216330
Commit
90216330
authored
Apr 11, 2024
by
Shuimo
Browse files
Options
Browse Files
Download
Plain Diff
update the text_badcase script and add auto upload s3 function
parents
ad8941e3
ff8f62aa
Changes
16
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
272 additions
and
350 deletions
+272
-350
benchmark.yml
.github/workflows/benchmark.yml
+19
-6
ocr_demo.py
demo/ocr_demo.py
+4
-3
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+4
-5
AbsReaderWriter.py
magic_pdf/io/AbsReaderWriter.py
+2
-2
DiskReaderWriter.py
magic_pdf/io/DiskReaderWriter.py
+36
-21
S3ReaderWriter.py
magic_pdf/io/S3ReaderWriter.py
+58
-23
commons.py
magic_pdf/libs/commons.py
+4
-24
hash_utils.py
magic_pdf/libs/hash_utils.py
+15
-0
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+31
-92
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+6
-50
pdf_parse_by_txt.py
magic_pdf/pdf_parse_by_txt.py
+27
-110
pdf_parse_for_train.py
magic_pdf/pdf_parse_for_train.py
+1
-2
detect_footnote.py
magic_pdf/pre_proc/detect_footnote.py
+1
-1
ocr_cut_image.py
magic_pdf/pre_proc/ocr_cut_image.py
+5
-7
spark_api.py
magic_pdf/spark/spark_api.py
+55
-4
ocr_badcase.py
tools/ocr_badcase.py
+4
-0
No files found.
.github/workflows/benchmark.yml
View file @
90216330
...
@@ -5,7 +5,13 @@ name: PDF
...
@@ -5,7 +5,13 @@ name: PDF
on
:
on
:
push
:
push
:
branches
:
branches
:
-
master
-
"
master"
paths-ignore
:
-
"
cmds/**"
-
"
**.md"
pull_request
:
branches
:
-
"
master"
paths-ignore
:
paths-ignore
:
-
"
cmds/**"
-
"
cmds/**"
-
"
**.md"
-
"
**.md"
...
@@ -18,14 +24,16 @@ jobs:
...
@@ -18,14 +24,16 @@ jobs:
fail-fast
:
true
fail-fast
:
true
steps
:
steps
:
-
name
:
config-net
run
:
|
export http_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
export https_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
-
name
:
PDF benchmark
-
name
:
PDF benchmark
uses
:
actions/checkout@v3
uses
:
actions/checkout@v3
with
:
with
:
fetch-depth
:
2
fetch-depth
:
2
-
name
:
check-requirements
-
name
:
check-requirements
run
:
|
run
:
|
export http_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
export https_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
changed_files=$(git diff --name-only -r HEAD~1 HEAD)
changed_files=$(git diff --name-only -r HEAD~1 HEAD)
echo $changed_files
echo $changed_files
if [[ $changed_files =~ "requirements.txt" ]]; then
if [[ $changed_files =~ "requirements.txt" ]]; then
...
@@ -36,12 +44,17 @@ jobs:
...
@@ -36,12 +44,17 @@ jobs:
-
name
:
benchmark
-
name
:
benchmark
run
:
|
run
:
|
echo "start test"
echo "start test"
cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip
output
.json
cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip
badcase.json overall.json base_data
.json
notify_to_feishu
:
notify_to_feishu
:
if
:
${{ (github.ref_name == 'master') }}
if
:
${{
always() && !cancelled() && contains(needs.*.result, 'failure') &&
(github.ref_name == 'master') }}
needs
:
[
pdf-test
]
needs
:
[
pdf-test
]
runs-on
:
[
pdf
]
runs-on
:
[
pdf
]
steps
:
steps
:
-
name
:
notify
-
name
:
notify
run
:
|
run
:
|
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
curl ${{ secrets.WEBHOOK_URL }} -H 'Content-Type: application/json' -d '{
"msgtype": "text",
"text": {
"content": "'${{ github.repository }}' GitHubAction Failed!\n 细节请查看:https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"
}
}'
demo/ocr_demo.py
View file @
90216330
...
@@ -115,8 +115,9 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
...
@@ -115,8 +115,9 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
ocr_local_parse
(
pdf_path
,
json_file_path
)
# ocr_local_parse(pdf_path, json_file_path)
# book_name = "科数网/edu_00011318"
# ocr_online_parse(book_name)
book_name
=
"数学新星网/edu_00001236"
ocr_online_parse
(
book_name
)
pass
pass
magic_pdf/dict2md/ocr_mkcontent.py
View file @
90216330
from
magic_pdf.libs.commons
import
s3_image_save_path
,
join_path
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
...
@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
...
@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
if
not
span
.
get
(
'image_path'
):
if
not
span
.
get
(
'image_path'
):
continue
continue
else
:
else
:
content
=
f
"
})"
content
=
f
""
else
:
else
:
content
=
ocr_escape_special_markdown_char
(
span
[
'content'
])
# 转义特殊符号
content
=
ocr_escape_special_markdown_char
(
span
[
'content'
])
# 转义特殊符号
if
span
[
'type'
]
==
ContentType
.
InlineEquation
:
if
span
[
'type'
]
==
ContentType
.
InlineEquation
:
...
@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
...
@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
if
mode
==
'mm'
:
if
mode
==
'mm'
:
content
=
f
"
\n

})
\n
"
content
=
f
"
\n

\n
"
elif
mode
==
'nlp'
:
elif
mode
==
'nlp'
:
pass
pass
if
content
!=
''
:
if
content
!=
''
:
...
@@ -195,13 +194,13 @@ def line_to_standard_format(line):
...
@@ -195,13 +194,13 @@ def line_to_standard_format(line):
if
span
[
'type'
]
==
ContentType
.
Image
:
if
span
[
'type'
]
==
ContentType
.
Image
:
content
=
{
content
=
{
'type'
:
'image'
,
'type'
:
'image'
,
'img_path'
:
join_path
(
s3_image_save_path
,
span
[
'image_path'
])
'img_path'
:
span
[
'image_path'
]
}
}
return
content
return
content
elif
span
[
'type'
]
==
ContentType
.
Table
:
elif
span
[
'type'
]
==
ContentType
.
Table
:
content
=
{
content
=
{
'type'
:
'table'
,
'type'
:
'table'
,
'img_path'
:
join_path
(
s3_image_save_path
,
span
[
'image_path'
])
'img_path'
:
span
[
'image_path'
]
}
}
return
content
return
content
else
:
else
:
...
...
magic_pdf/io/AbsReaderWriter.py
View file @
90216330
...
@@ -10,10 +10,10 @@ class AbsReaderWriter(ABC):
...
@@ -10,10 +10,10 @@ class AbsReaderWriter(ABC):
def
__init__
(
self
,
parent_path
):
def
__init__
(
self
,
parent_path
):
# 初始化代码可以在这里添加,如果需要的话
# 初始化代码可以在这里添加,如果需要的话
self
.
parent_path
=
parent_path
# 对于本地目录是父目录,对于s3是会写到这个
ap
th下。
self
.
parent_path
=
parent_path
# 对于本地目录是父目录,对于s3是会写到这个
pa
th下。
@
abstractmethod
@
abstractmethod
def
read
(
self
,
path
:
str
,
mode
=
"text"
):
def
read
(
self
,
path
:
str
,
mode
=
MODE_TXT
):
"""
"""
无论对于本地还是s3的路径,检查如果path是绝对路径,那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
无论对于本地还是s3的路径,检查如果path是绝对路径,那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
"""
"""
...
...
magic_pdf/io/DiskReaderWriter.py
View file @
90216330
import
os
import
os
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
loguru
import
logger
from
loguru
import
logger
MODE_TXT
=
"text"
MODE_BIN
=
"binary"
class
DiskReaderWriter
(
AbsReaderWriter
):
class
DiskReaderWriter
(
AbsReaderWriter
):
def
__init__
(
self
,
parent_path
,
encoding
=
'utf-8'
):
def
__init__
(
self
,
parent_path
,
encoding
=
'utf-8'
):
self
.
path
=
parent_path
self
.
path
=
parent_path
self
.
encoding
=
encoding
self
.
encoding
=
encoding
def
read
(
self
,
mode
=
"text"
):
def
read
(
self
,
path
,
mode
=
MODE_TXT
):
if
not
os
.
path
.
exists
(
self
.
path
):
if
os
.
path
.
isabs
(
path
):
logger
.
error
(
f
"文件 {self.path} 不存在"
)
abspath
=
path
raise
Exception
(
f
"文件 {self.path} 不存在"
)
else
:
if
mode
==
"text"
:
abspath
=
os
.
path
.
join
(
self
.
path
,
path
)
with
open
(
self
.
path
,
'r'
,
encoding
=
self
.
encoding
)
as
f
:
if
not
os
.
path
.
exists
(
abspath
):
logger
.
error
(
f
"文件 {abspath} 不存在"
)
raise
Exception
(
f
"文件 {abspath} 不存在"
)
if
mode
==
MODE_TXT
:
with
open
(
abspath
,
'r'
,
encoding
=
self
.
encoding
)
as
f
:
return
f
.
read
()
return
f
.
read
()
elif
mode
==
"binary"
:
elif
mode
==
MODE_BIN
:
with
open
(
self
.
path
,
'rb'
)
as
f
:
with
open
(
abs
path
,
'rb'
)
as
f
:
return
f
.
read
()
return
f
.
read
()
else
:
else
:
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
def
write
(
self
,
data
,
mode
=
"text"
):
def
write
(
self
,
content
,
path
,
mode
=
MODE_TXT
):
if
mode
==
"text"
:
if
os
.
path
.
isabs
(
path
):
with
open
(
self
.
path
,
'w'
,
encoding
=
self
.
encoding
)
as
f
:
abspath
=
path
f
.
write
(
data
)
else
:
logger
.
info
(
f
"内容已成功写入 {self.path}"
)
abspath
=
os
.
path
.
join
(
self
.
path
,
path
)
if
mode
==
MODE_TXT
:
with
open
(
abspath
,
'w'
,
encoding
=
self
.
encoding
)
as
f
:
f
.
write
(
content
)
logger
.
info
(
f
"内容已成功写入 {abspath}"
)
elif
mode
==
"binary"
:
elif
mode
==
MODE_BIN
:
with
open
(
self
.
path
,
'wb'
)
as
f
:
with
open
(
abs
path
,
'wb'
)
as
f
:
f
.
write
(
data
)
f
.
write
(
content
)
logger
.
info
(
f
"内容已成功写入 {
self.
path}"
)
logger
.
info
(
f
"内容已成功写入 {
abs
path}"
)
else
:
else
:
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
def
read_jsonl
(
self
,
path
:
str
,
byte_start
=
0
,
byte_end
=
None
,
encoding
=
'utf-8'
):
return
self
.
read
(
path
)
# 使用示例
# 使用示例
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
file_path
=
"example.txt"
file_path
=
"
io/
example.txt"
drw
=
DiskReaderWriter
(
file_path
)
drw
=
DiskReaderWriter
(
"D:
\
projects
\
papayfork
\
Magic-PDF
\
magic_pdf"
)
# 写入内容到文件
# 写入内容到文件
drw
.
write
(
b
"Hello, World!"
,
mode
=
"binary"
)
drw
.
write
(
b
"Hello, World!"
,
path
=
"io/example.txt"
,
mode
=
"binary"
)
# 从文件读取内容
# 从文件读取内容
content
=
drw
.
read
()
content
=
drw
.
read
(
path
=
file_path
)
if
content
:
if
content
:
logger
.
info
(
f
"从 {file_path} 读取的内容: {content}"
)
logger
.
info
(
f
"从 {file_path} 读取的内容: {content}"
)
...
...
magic_pdf/io/S3ReaderWriter.py
View file @
90216330
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.commons
import
parse_aws_param
,
parse_bucket_key
from
magic_pdf.libs.commons
import
parse_aws_param
,
parse_bucket_key
import
boto3
import
boto3
from
loguru
import
logger
from
loguru
import
logger
from
boto3.s3.transfer
import
TransferConfig
from
boto3.s3.transfer
import
TransferConfig
from
botocore.config
import
Config
from
botocore.config
import
Config
import
os
MODE_TXT
=
"text"
MODE_BIN
=
"binary"
class
S3ReaderWriter
(
AbsReaderWriter
):
class
S3ReaderWriter
(
AbsReaderWriter
):
def
__init__
(
self
,
ak
:
str
,
sk
:
str
,
endpoint_url
:
str
,
addressing_style
:
str
):
def
__init__
(
self
,
ak
:
str
,
sk
:
str
,
endpoint_url
:
str
,
addressing_style
:
str
,
parent_path
:
str
):
self
.
client
=
self
.
_get_client
(
ak
,
sk
,
endpoint_url
,
addressing_style
)
self
.
client
=
self
.
_get_client
(
ak
,
sk
,
endpoint_url
,
addressing_style
)
self
.
path
=
parent_path
def
_get_client
(
self
,
ak
:
str
,
sk
:
str
,
endpoint_url
:
str
,
addressing_style
:
str
):
def
_get_client
(
self
,
ak
:
str
,
sk
:
str
,
endpoint_url
:
str
,
addressing_style
:
str
):
s3_client
=
boto3
.
client
(
s3_client
=
boto3
.
client
(
...
@@ -22,51 +25,83 @@ class S3ReaderWriter(AbsReaderWriter):
...
@@ -22,51 +25,83 @@ class S3ReaderWriter(AbsReaderWriter):
retries
=
{
'max_attempts'
:
5
,
'mode'
:
'standard'
}),
retries
=
{
'max_attempts'
:
5
,
'mode'
:
'standard'
}),
)
)
return
s3_client
return
s3_client
def
read
(
self
,
s3_path
,
mode
=
"text"
,
encoding
=
"utf-8"
):
bucket_name
,
bucket_key
=
parse_bucket_key
(
s3_path
)
def
read
(
self
,
s3_relative_path
,
mode
=
MODE_TXT
,
encoding
=
"utf-8"
):
res
=
self
.
client
.
get_object
(
Bucket
=
bucket_name
,
Key
=
bucket_key
)
if
s3_relative_path
.
startswith
(
"s3://"
):
s3_path
=
s3_relative_path
else
:
s3_path
=
os
.
path
.
join
(
self
.
path
,
s3_relative_path
)
bucket_name
,
key
=
parse_bucket_key
(
s3_path
)
res
=
self
.
client
.
get_object
(
Bucket
=
bucket_name
,
Key
=
key
)
body
=
res
[
"Body"
]
.
read
()
body
=
res
[
"Body"
]
.
read
()
if
mode
==
'text'
:
if
mode
==
MODE_TXT
:
data
=
body
.
decode
(
encoding
)
# Decode bytes to text
data
=
body
.
decode
(
encoding
)
# Decode bytes to text
elif
mode
==
'binary'
:
elif
mode
==
MODE_BIN
:
data
=
body
data
=
body
else
:
else
:
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
return
data
return
data
def
write
(
self
,
data
,
s3_path
,
mode
=
"text"
,
encoding
=
"utf-8"
):
def
write
(
self
,
content
,
s3_relative_path
,
mode
=
MODE_TXT
,
encoding
=
"utf-8"
):
if
mode
==
'text'
:
if
s3_relative_path
.
startswith
(
"s3://"
):
body
=
data
.
encode
(
encoding
)
# Encode text data as bytes
s3_path
=
s3_relative_path
elif
mode
==
'binary'
:
else
:
body
=
data
s3_path
=
os
.
path
.
join
(
self
.
path
,
s3_relative_path
)
if
mode
==
MODE_TXT
:
body
=
content
.
encode
(
encoding
)
# Encode text data as bytes
elif
mode
==
MODE_BIN
:
body
=
content
else
:
else
:
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
bucket_name
,
bucket_
key
=
parse_bucket_key
(
s3_path
)
bucket_name
,
key
=
parse_bucket_key
(
s3_path
)
self
.
client
.
put_object
(
Body
=
body
,
Bucket
=
bucket_name
,
Key
=
bucket_
key
)
self
.
client
.
put_object
(
Body
=
body
,
Bucket
=
bucket_name
,
Key
=
key
)
logger
.
info
(
f
"内容已写入 {s3_path} "
)
logger
.
info
(
f
"内容已写入 {s3_path} "
)
def
read_jsonl
(
self
,
path
:
str
,
byte_start
=
0
,
byte_end
=
None
,
mode
=
MODE_TXT
,
encoding
=
'utf-8'
):
if
path
.
startswith
(
"s3://"
):
s3_path
=
path
else
:
s3_path
=
os
.
path
.
join
(
self
.
path
,
path
)
bucket_name
,
key
=
parse_bucket_key
(
s3_path
)
range_header
=
f
'bytes={byte_start}-{byte_end}'
if
byte_end
else
f
'bytes={byte_start}-'
res
=
self
.
client
.
get_object
(
Bucket
=
bucket_name
,
Key
=
key
,
Range
=
range_header
)
body
=
res
[
"Body"
]
.
read
()
if
mode
==
MODE_TXT
:
data
=
body
.
decode
(
encoding
)
# Decode bytes to text
elif
mode
==
MODE_BIN
:
data
=
body
else
:
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
return
data
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
# Config the connection info
# Config the connection info
ak
=
""
ak
=
""
sk
=
""
sk
=
""
endpoint_url
=
""
endpoint_url
=
""
addressing_style
=
""
addressing_style
=
"
auto
"
bucket_name
=
""
# Create an S3ReaderWriter object
# Create an S3ReaderWriter object
s3_reader_writer
=
S3ReaderWriter
(
ak
,
sk
,
endpoint_url
,
addressing_style
)
s3_reader_writer
=
S3ReaderWriter
(
ak
,
sk
,
endpoint_url
,
addressing_style
,
"s3://bucket_name/"
)
# Write text data to S3
# Write text data to S3
text_data
=
"This is some text data"
text_data
=
"This is some text data"
s3_reader_writer
.
write
(
data
=
text_data
,
s3_
path
=
"s3://bucket_name/ebook/test/test.json"
,
mode
=
'text'
)
s3_reader_writer
.
write
(
data
=
text_data
,
s3_
relative_path
=
f
"s3://{bucket_name}/ebook/test/test.json"
,
mode
=
MODE_TXT
)
# Read text data from S3
# Read text data from S3
text_data_read
=
s3_reader_writer
.
read
(
s3_
path
=
"s3://bucket_name/ebook/test/test.json"
,
mode
=
'text'
)
text_data_read
=
s3_reader_writer
.
read
(
s3_
relative_path
=
f
"s3://{bucket_name}/ebook/test/test.json"
,
mode
=
MODE_TXT
)
logger
.
info
(
f
"Read text data from S3: {text_data_read}"
)
logger
.
info
(
f
"Read text data from S3: {text_data_read}"
)
# Write binary data to S3
# Write binary data to S3
binary_data
=
b
"This is some binary data"
binary_data
=
b
"This is some binary data"
s3_reader_writer
.
write
(
data
=
text_data
,
s3_
path
=
"s3://bucket_name/ebook/test/test2.json"
,
mode
=
'binary'
)
s3_reader_writer
.
write
(
data
=
text_data
,
s3_
relative_path
=
f
"s3://{bucket_name}/ebook/test/test.json"
,
mode
=
MODE_BIN
)
# Read binary data from S3
# Read binary data from S3
binary_data_read
=
s3_reader_writer
.
read
(
s3_path
=
"s3://bucket_name/ebook/test/test2.json"
,
mode
=
'binary'
)
binary_data_read
=
s3_reader_writer
.
read
(
s3_relative_path
=
f
"s3://{bucket_name}/ebook/test/test.json"
,
mode
=
MODE_BIN
)
logger
.
info
(
f
"Read binary data from S3: {binary_data_read}"
)
logger
.
info
(
f
"Read binary data from S3: {binary_data_read}"
)
\ No newline at end of file
# Range Read text data from S3
binary_data_read
=
s3_reader_writer
.
read_jsonl
(
path
=
f
"s3://{bucket_name}/ebook/test/test.json"
,
byte_start
=
0
,
byte_end
=
10
,
mode
=
MODE_BIN
)
logger
.
info
(
f
"Read binary data from S3: {binary_data_read}"
)
magic_pdf/libs/commons.py
View file @
90216330
...
@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
...
@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
json_dump_path
=
"s3://llm-pdf-text/json_dump/"
json_dump_path
=
"s3://llm-pdf-text/json_dump/"
s3_image_save_path
=
"s3://mllm-raw-media/pdf2md_img/"
# TODO
基础库不应该有这些存在的路径,应该在业务代码中定义
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" #
基础库不应该有这些存在的路径,应该在业务代码中定义
def
get_top_percent_list
(
num_list
,
percent
):
def
get_top_percent_list
(
num_list
,
percent
):
...
@@ -120,29 +120,9 @@ def read_file(pdf_path: str, s3_profile):
...
@@ -120,29 +120,9 @@ def read_file(pdf_path: str, s3_profile):
return
f
.
read
()
return
f
.
read
()
def
get_docx_model_output
(
pdf_model_output
,
pdf_model_s3_profile
,
page_id
):
def
get_docx_model_output
(
pdf_model_output
,
page_id
):
if
isinstance
(
pdf_model_output
,
str
):
model_output_json_path
=
join_path
(
pdf_model_output
,
f
"page_{page_id + 1}.json"
)
# 模型输出的页面编号从1开始的
model_output_json
=
pdf_model_output
[
page_id
]
if
os
.
path
.
exists
(
model_output_json_path
):
json_from_docx
=
read_file
(
model_output_json_path
,
pdf_model_s3_profile
)
model_output_json
=
json
.
loads
(
json_from_docx
)
else
:
try
:
model_output_json_path
=
join_path
(
pdf_model_output
,
"model.json"
)
with
open
(
model_output_json_path
,
"r"
,
encoding
=
"utf-8"
)
as
f
:
model_output_json
=
json
.
load
(
f
)
model_output_json
=
model_output_json
[
"doc_layout_result"
][
page_id
]
except
:
s3_model_output_json_path
=
join_path
(
pdf_model_output
,
f
"page_{page_id + 1}.json"
)
s3_model_output_json_path
=
join_path
(
pdf_model_output
,
f
"{page_id}.json"
)
#s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id }.json")
# logger.warning(f"model_output_json_path: {model_output_json_path} not found. try to load from s3: {s3_model_output_json_path}")
s
=
read_file
(
s3_model_output_json_path
,
pdf_model_s3_profile
)
return
json
.
loads
(
s
)
elif
isinstance
(
pdf_model_output
,
list
):
model_output_json
=
pdf_model_output
[
page_id
]
return
model_output_json
return
model_output_json
...
...
magic_pdf/libs/hash_utils.py
0 → 100644
View file @
90216330
import
hashlib
def
compute_md5
(
file_bytes
):
hasher
=
hashlib
.
md5
()
hasher
.
update
(
file_bytes
)
return
hasher
.
hexdigest
()
.
upper
()
def
compute_sha256
(
input_string
):
hasher
=
hashlib
.
sha256
()
# 在Python3中,需要将字符串转化为字节对象才能被哈希函数处理
input_bytes
=
input_string
.
encode
(
'utf-8'
)
hasher
.
update
(
input_bytes
)
return
hasher
.
hexdigest
()
magic_pdf/libs/pdf_image_tools.py
View file @
90216330
import
os
from
pathlib
import
Path
from
typing
import
Tuple
import
io
# from app.common.s3 import get_s3_client
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.libs.commons
import
fitz
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.commons
import
parse_bucket_key
,
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.hash_utils
import
compute_sha256
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_return_path
=
None
,
img_s3_client
=
None
,
upload_switch
=
True
):
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
):
"""
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
"""
"""
# 拼接文件名
# 拼接文件名
filename
=
f
"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
filename
=
f
"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
# 拼接路径
image_save_path
=
join_path
(
save_parent_path
,
filename
)
s3_img_path
=
join_path
(
s3_return_path
,
filename
)
if
s3_return_path
is
not
None
else
None
# 打印图片文件名
# print(f"Saved {image_save_path}")
#检查坐标
# x_check = int(bbox[2]) - int(bbox[0])
# y_check = int(bbox[3]) - int(bbox[1])
# if x_check <= 0 or y_check <= 0:
#
# if image_save_path.startswith("s3://"):
# logger.exception(f"传入图片坐标有误,x1<x0或y1<y0,{s3_img_path}")
# return s3_img_path
# else:
# logger.exception(f"传入图片坐标有误,x1<x0或y1<y0,{image_save_path}")
# return image_save_path
# 老版本返回不带bucket的路径
img_path
=
join_path
(
return_path
,
filename
)
if
return_path
is
not
None
else
None
# 新版本生成平铺路径
img_hash256_path
=
f
"{compute_sha256(img_path)}.jpg"
# 将坐标转换为fitz.Rect对象
# 将坐标转换为fitz.Rect对象
rect
=
fitz
.
Rect
(
*
bbox
)
rect
=
fitz
.
Rect
(
*
bbox
)
...
@@ -42,39 +26,17 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
...
@@ -42,39 +26,17 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
# 截取图片
# 截取图片
pix
=
page
.
get_pixmap
(
clip
=
rect
,
matrix
=
zoom
)
pix
=
page
.
get_pixmap
(
clip
=
rect
,
matrix
=
zoom
)
if
image_save_path
.
startswith
(
"s3://"
):
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
if
not
upload_switch
:
pass
imageWriter
.
write
(
data
=
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
else
:
# 图片保存到s3
return
img_hash256_path
bucket_name
,
bucket_key
=
parse_bucket_key
(
image_save_path
)
# 将字节流上传到s3
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
def
save_images_by_bboxes
(
page_num
:
int
,
page
:
fitz
.
Page
,
pdf_bytes_md5
:
str
,
file_obj
=
io
.
BytesIO
(
byte_data
)
image_bboxes
:
list
,
images_overlap_backup
:
list
,
table_bboxes
:
list
,
if
img_s3_client
is
not
None
:
equation_inline_bboxes
:
list
,
img_s3_client
.
upload_fileobj
(
file_obj
,
bucket_name
,
bucket_key
)
equation_interline_bboxes
:
list
,
imageWriter
)
->
dict
:
# 每个图片上传任务都创建一个新的client
# img_s3_client_once = get_s3_client(image_save_path)
# img_s3_client_once.upload_fileobj(file_obj, bucket_name, bucket_key)
else
:
logger
.
exception
(
"must input img_s3_client"
)
return
s3_img_path
else
:
# 保存图片到本地
# 先检查一下image_save_path的父目录是否存在,如果不存在,就创建
parent_dir
=
os
.
path
.
dirname
(
image_save_path
)
if
not
os
.
path
.
exists
(
parent_dir
):
os
.
makedirs
(
parent_dir
)
pix
.
save
(
image_save_path
,
jpg_quality
=
95
)
# 为了直接能在markdown里看,这里把地址改为相对于mardown的地址
pth
=
Path
(
image_save_path
)
image_save_path
=
f
"{pth.parent.name}/{pth.name}"
return
image_save_path
def
save_images_by_bboxes
(
book_name
:
str
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_path
:
str
,
image_bboxes
:
list
,
images_overlap_backup
:
list
,
table_bboxes
:
list
,
equation_inline_bboxes
:
list
,
equation_interline_bboxes
:
list
,
img_s3_client
)
->
dict
:
"""
"""
返回一个dict, key为bbox, 值是图片地址
返回一个dict, key为bbox, 值是图片地址
"""
"""
...
@@ -85,53 +47,30 @@ def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_p
...
@@ -85,53 +47,30 @@ def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_p
interline_eq_info
=
[]
interline_eq_info
=
[]
# 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
# 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
s3_return_image_path
=
join_path
(
book_name
,
"images"
)
image_save_path
=
join_path
(
save_path
,
s3_return_image_path
)
s3_return_table_path
=
join_path
(
book_name
,
"tables"
)
table_save_path
=
join_path
(
save_path
,
s3_return_table_path
)
s3_return_equations_inline_path
=
join_path
(
book_name
,
"equations_inline"
)
equation_inline_save_path
=
join_path
(
save_path
,
s3_return_equations_inline_path
)
s3_return_equation_interline_path
=
join_path
(
book_name
,
"equation_interline"
)
equation_interline_save_path
=
join_path
(
save_path
,
s3_return_equation_interline_path
)
def
return_path
(
type
):
return
join_path
(
pdf_bytes_md5
,
type
)
for
bbox
in
image_bboxes
:
for
bbox
in
image_bboxes
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"image_bboxes: 错误的box, {bbox}"
)
logger
.
warning
(
f
"image_bboxes: 错误的box, {bbox}"
)
continue
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
image_save_path
,
s3_return_image_path
,
img_s3_client
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
image_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
images_overlap_backup
:
for
bbox
in
images_overlap_backup
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"images_overlap_backup: 错误的box, {bbox}"
)
logger
.
warning
(
f
"images_overlap_backup: 错误的box, {bbox}"
)
continue
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
image_save_path
,
s3_return_image_path
,
img_s3_client
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_backup_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
image_backup_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
table_bboxes
:
for
bbox
in
table_bboxes
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"table_bboxes: 错误的box, {bbox}"
)
logger
.
warning
(
f
"table_bboxes: 错误的box, {bbox}"
)
continue
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
table_save_path
,
s3_return_table_path
,
img_s3_client
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"tables"
),
imageWriter
)
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
equation_inline_bboxes
:
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
\ No newline at end of file
logger
.
warning
(
f
"equation_inline_bboxes: 错误的box, {bbox}"
)
continue
image_path
=
cut_image
(
bbox
[:
4
],
page_num
,
page
,
equation_inline_save_path
,
s3_return_equations_inline_path
,
img_s3_client
,
upload_switch
=
False
)
inline_eq_info
.
append
({
'bbox'
:
bbox
[:
4
],
"image_path"
:
image_path
,
"latex_text"
:
bbox
[
4
]})
for
bbox
in
equation_interline_bboxes
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"equation_interline_bboxes: 错误的box, {bbox}"
)
continue
image_path
=
cut_image
(
bbox
[:
4
],
page_num
,
page
,
equation_interline_save_path
,
s3_return_equation_interline_path
,
img_s3_client
,
upload_switch
=
False
)
interline_eq_info
.
append
({
"bbox"
:
bbox
[:
4
],
"image_path"
:
image_path
,
"latex_text"
:
bbox
[
4
]})
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
\ No newline at end of file
magic_pdf/pdf_parse_by_ocr.py
View file @
90216330
import
json
import
os
import
time
import
time
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_text_bbox
from
magic_pdf.libs.commons
import
(
from
magic_pdf.libs.commons
import
(
read_file
,
join_path
,
fitz
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_delta_time
,
get_docx_model_output
,
get_docx_model_output
,
)
)
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.para.para_split
import
para_split
from
magic_pdf.para.para_split
import
para_split
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
...
@@ -38,38 +30,16 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
...
@@ -38,38 +30,16 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def
parse_pdf_by_ocr
(
def
parse_pdf_by_ocr
(
pdf_bytes
,
pdf_bytes
,
pdf_model_output
,
pdf_model_output
,
save_path
,
imageWriter
,
book_name
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
start_page_id
=
0
,
end_page_id
=
None
,
end_page_id
=
None
,
debug_mode
=
False
,
debug_mode
=
False
,
):
):
pdf_bytes_md5
=
compute_md5
(
pdf_bytes
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
book_name
=
sanitize_filename
(
book_name
)
md_bookname_save_path
=
""
if
debug_mode
:
save_path
=
join_path
(
save_tmp_path
,
"md"
)
pdf_local_path
=
join_path
(
save_tmp_path
,
"download-pdfs"
,
book_name
)
if
not
os
.
path
.
exists
(
os
.
path
.
dirname
(
pdf_local_path
)):
# 如果目录不存在,创建它
os
.
makedirs
(
os
.
path
.
dirname
(
pdf_local_path
))
md_bookname_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
)
if
not
os
.
path
.
exists
(
md_bookname_save_path
):
# 如果目录不存在,创建它
os
.
makedirs
(
md_bookname_save_path
)
with
open
(
pdf_local_path
+
".pdf"
,
"wb"
)
as
pdf_file
:
pdf_file
.
write
(
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
# 初始化空的pdf_info_dict
# 初始化空的pdf_info_dict
pdf_info_dict
=
{}
pdf_info_dict
=
{}
img_s3_client
=
get_img_s3_client
(
save_path
,
image_s3_config
)
start_time
=
time
.
time
()
start_time
=
time
.
time
()
...
@@ -91,16 +61,14 @@ def parse_pdf_by_ocr(
...
@@ -91,16 +61,14 @@ def parse_pdf_by_ocr(
# 获取当前页的模型数据
# 获取当前页的模型数据
ocr_page_info
=
get_docx_model_output
(
ocr_page_info
=
get_docx_model_output
(
pdf_model_output
,
p
df_model_profile
,
p
age_id
pdf_model_output
,
page_id
)
)
"""从json中获取每页的页码、页眉、页脚的bbox"""
"""从json中获取每页的页码、页眉、页脚的bbox"""
page_no_bboxes
=
parse_pageNos
(
page_id
,
page
,
ocr_page_info
)
page_no_bboxes
=
parse_pageNos
(
page_id
,
page
,
ocr_page_info
)
header_bboxes
=
parse_headers
(
page_id
,
page
,
ocr_page_info
)
header_bboxes
=
parse_headers
(
page_id
,
page
,
ocr_page_info
)
footer_bboxes
=
parse_footers
(
page_id
,
page
,
ocr_page_info
)
footer_bboxes
=
parse_footers
(
page_id
,
page
,
ocr_page_info
)
footnote_bboxes
=
parse_footnotes_by_model
(
footnote_bboxes
=
parse_footnotes_by_model
(
page_id
,
page
,
ocr_page_info
,
debug_mode
=
debug_mode
)
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
debug_mode
=
debug_mode
)
# 构建需要remove的bbox字典
# 构建需要remove的bbox字典
need_remove_spans_bboxes_dict
=
{
need_remove_spans_bboxes_dict
=
{
...
@@ -179,7 +147,7 @@ def parse_pdf_by_ocr(
...
@@ -179,7 +147,7 @@ def parse_pdf_by_ocr(
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
'''对image和table截图'''
'''对image和table截图'''
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
pdf_bytes_md5
,
imageWriter
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list
=
[]
displayed_list
=
[]
...
@@ -242,16 +210,4 @@ def parse_pdf_by_ocr(
...
@@ -242,16 +210,4 @@ def parse_pdf_by_ocr(
"""分段"""
"""分段"""
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
'''在测试时,保存调试信息'''
if
debug_mode
:
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
# drow_bbox
draw_layout_bbox
(
pdf_info_dict
,
pdf_bytes
,
md_bookname_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_bytes
,
md_bookname_save_path
)
return
pdf_info_dict
return
pdf_info_dict
magic_pdf/pdf_parse_by_txt.py
View file @
90216330
This diff is collapsed.
Click to expand it.
magic_pdf/pdf_parse_for_train.py
View file @
90216330
...
@@ -112,7 +112,6 @@ def parse_pdf_for_train(
...
@@ -112,7 +112,6 @@ def parse_pdf_for_train(
pdf_model_output
,
pdf_model_output
,
save_path
,
save_path
,
book_name
,
book_name
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
start_page_id
=
0
,
end_page_id
=
None
,
end_page_id
=
None
,
...
@@ -200,7 +199,7 @@ def parse_pdf_for_train(
...
@@ -200,7 +199,7 @@ def parse_pdf_for_train(
flags
=
fitz
.
TEXTFLAGS_TEXT
,
flags
=
fitz
.
TEXTFLAGS_TEXT
,
)[
"blocks"
]
)[
"blocks"
]
model_output_json
=
get_docx_model_output
(
model_output_json
=
get_docx_model_output
(
pdf_model_output
,
p
df_model_profile
,
p
age_id
pdf_model_output
,
page_id
)
)
# 解析图片
# 解析图片
...
...
magic_pdf/pre_proc/detect_footnote.py
View file @
90216330
...
@@ -3,7 +3,7 @@ from magic_pdf.libs.commons import fitz # pyMuPDF库
...
@@ -3,7 +3,7 @@ from magic_pdf.libs.commons import fitz # pyMuPDF库
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
def
parse_footnotes_by_model
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
md_bookname_save_path
,
debug_mode
=
False
):
def
parse_footnotes_by_model
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
md_bookname_save_path
=
None
,
debug_mode
=
False
):
"""
"""
:param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
:param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
:param page :fitz读取的当前页的内容
:param page :fitz读取的当前页的内容
...
...
magic_pdf/pre_proc/ocr_cut_image.py
View file @
90216330
...
@@ -3,18 +3,16 @@ from magic_pdf.libs.ocr_content_type import ContentType
...
@@ -3,18 +3,16 @@ from magic_pdf.libs.ocr_content_type import ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
from
magic_pdf.libs.pdf_image_tools
import
cut_image
def
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
):
def
cut_image_and_table
(
spans
,
page
,
page_id
,
pdf_bytes_md5
,
imageWriter
):
def
s3_return_path
(
type
):
return
join_path
(
book_name
,
type
)
def
img_save
_path
(
type
):
def
return
_path
(
type
):
return
join_path
(
save_path
,
s3_return_path
(
type
)
)
return
join_path
(
pdf_bytes_md5
,
type
)
for
span
in
spans
:
for
span
in
spans
:
span_type
=
span
[
'type'
]
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Image
:
if
span_type
==
ContentType
.
Image
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'images'
),
s3_return_path
=
s3_return_path
(
'images'
),
img_s3_client
=
img_s3_client
)
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'images'
),
imageWriter
=
imageWriter
)
elif
span_type
==
ContentType
.
Table
:
elif
span_type
==
ContentType
.
Table
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'tables'
),
s3_return_path
=
s3_return_path
(
'tables'
),
img_s3_client
=
img_s3_client
)
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'tables'
),
imageWriter
=
imageWriter
)
return
spans
return
spans
magic_pdf/spark/spark_api.py
View file @
90216330
...
@@ -12,27 +12,78 @@
...
@@ -12,27 +12,78 @@
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
"""
from
loguru
import
logger
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
"""
解析文本类pdf
解析文本类pdf
"""
"""
pass
pdf_info_dict
=
parse_pdf_by_txt
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
pdf_info_dict
[
"parse_type"
]
=
"txt"
return
pdf_info_dict
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
"""
解析ocr类pdf
解析ocr类pdf
"""
"""
pass
pdf_info_dict
=
parse_pdf_by_ocr
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
pdf_info_dict
[
"parse_type"
]
=
"ocr"
return
pdf_info_dict
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
"""
ocr和文本混合的pdf,全部解析出来
ocr和文本混合的pdf,全部解析出来
"""
"""
pass
def
parse_pdf
(
method
):
\ No newline at end of file
try
:
return
method
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
except
Exception
as
e
:
logger
.
error
(
f
"{method.__name__} error: {e}"
)
return
None
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_txt
)
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"need_drop"
,
False
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
if
pdf_info_dict
is
None
:
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
else
:
pdf_info_dict
[
"parse_type"
]
=
"ocr"
else
:
pdf_info_dict
[
"parse_type"
]
=
"txt"
return
pdf_info_dict
def
spark_json_extractor
(
jso
:
dict
):
pass
tools/ocr_badcase.py
View file @
90216330
...
@@ -867,6 +867,7 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
...
@@ -867,6 +867,7 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
)
save_results
(
result_dict
,
overall_report_dict
,
badcase_file
,
overall_file
)
result
=
compare_edit_distance
(
base_data_path
,
overall_report_dict
)
result
=
compare_edit_distance
(
base_data_path
,
overall_report_dict
)
<<<<<<<
HEAD
if
all
([
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
]):
if
all
([
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
]):
try
:
try
:
...
@@ -874,7 +875,10 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
...
@@ -874,7 +875,10 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
upload_to_s3
(
overall_file
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
upload_to_s3
(
overall_file
,
s3_bucket_name
,
s3_file_directory
,
aws_access_key
,
aws_secret_key
,
end_point_url
)
except
Exception
as
e
:
except
Exception
as
e
:
print
(
f
"上传到S3时发生错误: {e}"
)
print
(
f
"上传到S3时发生错误: {e}"
)
=======
>>>>>>>
ff8f62aa3c28facc192104387f131d87978064fc
print
(
result
)
print
(
result
)
assert
result
==
1
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
"主函数,执行整个评估流程。"
)
parser
=
argparse
.
ArgumentParser
(
description
=
"主函数,执行整个评估流程。"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment