Qin Kaijie / pdf-miner · Commits

Commit 90216330, authored Apr 11, 2024 by Shuimo

    update the text_badcase script and add auto upload s3 function
Parents: ad8941e3, ff8f62aa

Showing 16 changed files with 272 additions and 350 deletions (+272 -350)
.github/workflows/benchmark.yml         +19   -6
demo/ocr_demo.py                         +4   -3
magic_pdf/dict2md/ocr_mkcontent.py       +4   -5
magic_pdf/io/AbsReaderWriter.py          +2   -2
magic_pdf/io/DiskReaderWriter.py        +36  -21
magic_pdf/io/S3ReaderWriter.py          +58  -23
magic_pdf/libs/commons.py                +4  -24
magic_pdf/libs/hash_utils.py            +15   -0
magic_pdf/libs/pdf_image_tools.py       +31  -92
magic_pdf/pdf_parse_by_ocr.py            +6  -50
magic_pdf/pdf_parse_by_txt.py           +27 -110
magic_pdf/pdf_parse_for_train.py         +1   -2
magic_pdf/pre_proc/detect_footnote.py    +1   -1
magic_pdf/pre_proc/ocr_cut_image.py      +5   -7
magic_pdf/spark/spark_api.py            +55   -4
tools/ocr_badcase.py                     +4   -0
.github/workflows/benchmark.yml

@@ -5,7 +5,13 @@ name: PDF
 on:
   push:
     branches:
-      - master
+      - "master"
+    paths-ignore:
+      - "cmds/**"
+      - "**.md"
   pull_request:
     branches:
       - "master"
+    paths-ignore:
+      - "cmds/**"
+      - "**.md"
@@ -18,14 +24,16 @@ jobs:
       fail-fast: true
     steps:
+      - name: config-net
+        run: |
+          export http_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
+          export https_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
       - name: PDF benchmark
         uses: actions/checkout@v3
         with:
           fetch-depth: 2
       - name: check-requirements
         run: |
+          export http_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
+          export https_proxy=http://bigdata_open_proxy:H89k5qwQRDYfz@10.140.90.20:10811
           changed_files=$(git diff --name-only -r HEAD~1 HEAD)
           echo $changed_files
           if [[ $changed_files =~ "requirements.txt" ]]; then
@@ -36,12 +44,17 @@ jobs:
       - name: benchmark
         run: |
           echo "start test"
-          cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip output.json
+          cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip badcase.json overall.json base_data.json

   notify_to_feishu:
-    if: ${{ (github.ref_name == 'master') }}
+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
     needs: [pdf-test]
     runs-on: [pdf]
     steps:
       - name: notify
         run: |
-          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
+          curl ${{ secrets.WEBHOOK_URL }} -H 'Content-Type: application/json' -d '{
+            "msgtype": "text",
+            "text": {
+              "content": "'${{ github.repository }}' GitHubAction Failed!\n See details: https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"
+            }
+          }'
demo/ocr_demo.py

@@ -115,8 +115,9 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
 if __name__ == '__main__':
     pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
     json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
-    ocr_local_parse(pdf_path, json_file_path)
-    # book_name = "科数网/edu_00011318"
-    # ocr_online_parse(book_name)
+    # ocr_local_parse(pdf_path, json_file_path)
+    book_name = "数学新星网/edu_00001236"
+    ocr_online_parse(book_name)
     pass
magic_pdf/dict2md/ocr_mkcontent.py

-from magic_pdf.libs.commons import s3_image_save_path, join_path
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import ContentType
@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
                     if not span.get('image_path'):
                         continue
                     else:
-                        content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
+                        content = f"![]({span['image_path']})"
                 else:
                     content = ocr_escape_special_markdown_char(span['content'])  # escape special characters
                     if span['type'] == ContentType.InlineEquation:
@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
                         content = f"\n$$\n{span['content']}\n$$\n"
                     elif span_type in [ContentType.Image, ContentType.Table]:
                         if mode == 'mm':
-                            content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
+                            content = f"\n![]({span['image_path']})\n"
                         elif mode == 'nlp':
                             pass
                 if content != '':
@@ -195,13 +194,13 @@ def line_to_standard_format(line):
         if span['type'] == ContentType.Image:
             content = {
                 'type': 'image',
-                'img_path': join_path(s3_image_save_path, span['image_path'])
+                'img_path': span['image_path']
             }
             return content
         elif span['type'] == ContentType.Table:
             content = {
                 'type': 'table',
-                'img_path': join_path(s3_image_save_path, span['image_path'])
+                'img_path': span['image_path']
             }
             return content
         else:
magic_pdf/io/AbsReaderWriter.py

@@ -10,10 +10,10 @@ class AbsReaderWriter(ABC):
     def __init__(self, parent_path):
         # initialization code can be added here if needed
-        self.parent_path = parent_path  # for local use this is the parent directory; for s3, writes go under this apth
+        self.parent_path = parent_path  # for local use this is the parent directory; for s3, writes go under this path

     @abstractmethod
-    def read(self, path: str, mode="text"):
+    def read(self, path: str, mode=MODE_TXT):
         """
         For both local and s3 paths: if path is absolute, do not prepend parent_path; if it is relative, join it with parent_path.
         """
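The docstring above fixes the path-resolution contract that both concrete readers/writers follow. A minimal sketch of that rule, assuming POSIX paths and treating full s3:// URIs like absolute paths (the resolve_path helper is illustrative, not part of the commit):

    import os

    def resolve_path(parent_path: str, path: str) -> str:
        # Hypothetical helper mirroring the contract in the docstring above:
        # absolute paths (and full s3:// URIs) are used as-is; relative
        # paths are joined onto parent_path. POSIX separators assumed.
        if path.startswith("s3://") or os.path.isabs(path):
            return path
        return os.path.join(parent_path, path)

    assert resolve_path("/data", "io/example.txt") == "/data/io/example.txt"
    assert resolve_path("/data", "/tmp/example.txt") == "/tmp/example.txt"
    assert resolve_path("s3://bucket/prefix", "s3://bucket/key.json") == "s3://bucket/key.json"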
magic_pdf/io/DiskReaderWriter.py

 import os
 from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
 from loguru import logger

+MODE_TXT = "text"
+MODE_BIN = "binary"

 class DiskReaderWriter(AbsReaderWriter):

     def __init__(self, parent_path, encoding='utf-8'):
         self.path = parent_path
         self.encoding = encoding

-    def read(self, mode="text"):
-        if not os.path.exists(self.path):
-            logger.error(f"file {self.path} does not exist")
-            raise Exception(f"file {self.path} does not exist")
-        if mode == "text":
-            with open(self.path, 'r', encoding=self.encoding) as f:
+    def read(self, path, mode=MODE_TXT):
+        if os.path.isabs(path):
+            abspath = path
+        else:
+            abspath = os.path.join(self.path, path)
+        if not os.path.exists(abspath):
+            logger.error(f"file {abspath} does not exist")
+            raise Exception(f"file {abspath} does not exist")
+        if mode == MODE_TXT:
+            with open(abspath, 'r', encoding=self.encoding) as f:
                 return f.read()
-        elif mode == "binary":
-            with open(self.path, 'rb') as f:
+        elif mode == MODE_BIN:
+            with open(abspath, 'rb') as f:
                 return f.read()
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")

-    def write(self, data, mode="text"):
-        if mode == "text":
-            with open(self.path, 'w', encoding=self.encoding) as f:
-                f.write(data)
-                logger.info(f"content successfully written to {self.path}")
-        elif mode == "binary":
-            with open(self.path, 'wb') as f:
-                f.write(data)
-                logger.info(f"content successfully written to {self.path}")
+    def write(self, content, path, mode=MODE_TXT):
+        if os.path.isabs(path):
+            abspath = path
+        else:
+            abspath = os.path.join(self.path, path)
+        if mode == MODE_TXT:
+            with open(abspath, 'w', encoding=self.encoding) as f:
+                f.write(content)
+                logger.info(f"content successfully written to {abspath}")
+        elif mode == MODE_BIN:
+            with open(abspath, 'wb') as f:
+                f.write(content)
+                logger.info(f"content successfully written to {abspath}")
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")

+    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
+        return self.read(path)

 # usage example
 if __name__ == "__main__":
-    file_path = "example.txt"
-    drw = DiskReaderWriter(file_path)
+    file_path = "io/example.txt"
+    drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
     # write content to the file
-    drw.write(b"Hello, World!", mode="binary")
+    drw.write(b"Hello, World!", path="io/example.txt", mode="binary")
     # read content back from the file
-    content = drw.read()
+    content = drw.read(path=file_path)
     if content:
         logger.info(f"content read from {file_path}: {content}")
magic_pdf/io/S3ReaderWriter.py

 from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key
 import boto3
 from loguru import logger
 from boto3.s3.transfer import TransferConfig
 from botocore.config import Config
+import os

+MODE_TXT = "text"
+MODE_BIN = "binary"

 class S3ReaderWriter(AbsReaderWriter):
-    def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
+    def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str, parent_path: str):
         self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
+        self.path = parent_path

     def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
         s3_client = boto3.client(
@@ -22,51 +25,83 @@ class S3ReaderWriter(AbsReaderWriter):
             retries={'max_attempts': 5, 'mode': 'standard'}),
         )
         return s3_client

-    def read(self, s3_path, mode="text", encoding="utf-8"):
-        bucket_name, bucket_key = parse_bucket_key(s3_path)
-        res = self.client.get_object(Bucket=bucket_name, Key=bucket_key)
+    def read(self, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
+        if s3_relative_path.startswith("s3://"):
+            s3_path = s3_relative_path
+        else:
+            s3_path = os.path.join(self.path, s3_relative_path)
+        bucket_name, key = parse_bucket_key(s3_path)
+        res = self.client.get_object(Bucket=bucket_name, Key=key)
         body = res["Body"].read()
-        if mode == 'text':
+        if mode == MODE_TXT:
             data = body.decode(encoding)  # Decode bytes to text
-        elif mode == 'binary':
+        elif mode == MODE_BIN:
             data = body
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
         return data

-    def write(self, data, s3_path, mode="text", encoding="utf-8"):
-        if mode == 'text':
-            body = data.encode(encoding)  # Encode text data as bytes
-        elif mode == 'binary':
-            body = data
+    def write(self, content, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
+        if s3_relative_path.startswith("s3://"):
+            s3_path = s3_relative_path
+        else:
+            s3_path = os.path.join(self.path, s3_relative_path)
+        if mode == MODE_TXT:
+            body = content.encode(encoding)  # Encode text data as bytes
+        elif mode == MODE_BIN:
+            body = content
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
-        bucket_name, bucket_key = parse_bucket_key(s3_path)
-        self.client.put_object(Body=body, Bucket=bucket_name, Key=bucket_key)
+        bucket_name, key = parse_bucket_key(s3_path)
+        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
         logger.info(f"content written to {s3_path}")

+    def read_jsonl(self, path: str, byte_start=0, byte_end=None, mode=MODE_TXT, encoding='utf-8'):
+        if path.startswith("s3://"):
+            s3_path = path
+        else:
+            s3_path = os.path.join(self.path, path)
+        bucket_name, key = parse_bucket_key(s3_path)
+        range_header = f'bytes={byte_start}-{byte_end}' if byte_end else f'bytes={byte_start}-'
+        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
+        body = res["Body"].read()
+        if mode == MODE_TXT:
+            data = body.decode(encoding)  # Decode bytes to text
+        elif mode == MODE_BIN:
+            data = body
+        else:
+            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
+        return data

 if __name__ == "__main__":
     # Config the connection info
     ak = ""
     sk = ""
     endpoint_url = ""
-    addressing_style = ""
+    addressing_style = "auto"
+    bucket_name = ""
     # Create an S3ReaderWriter object
-    s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style)
+    s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style, "s3://bucket_name/")
     # Write text data to S3
     text_data = "This is some text data"
-    s3_reader_writer.write(data=text_data, s3_path="s3://bucket_name/ebook/test/test.json", mode='text')
+    s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
     # Read text data from S3
-    text_data_read = s3_reader_writer.read(s3_path="s3://bucket_name/ebook/test/test.json", mode='text')
+    text_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
     logger.info(f"Read text data from S3: {text_data_read}")
     # Write binary data to S3
     binary_data = b"This is some binary data"
-    s3_reader_writer.write(data=text_data, s3_path="s3://bucket_name/ebook/test/test2.json", mode='binary')
+    s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
     # Read binary data from S3
-    binary_data_read = s3_reader_writer.read(s3_path="s3://bucket_name/ebook/test/test2.json", mode='binary')
+    binary_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
     logger.info(f"Read binary data from S3: {binary_data_read}")
+    # Range-read text data from S3
+    binary_data_read = s3_reader_writer.read_jsonl(path=f"s3://{bucket_name}/ebook/test/test.json",
+                                                   byte_start=0, byte_end=10, mode=MODE_BIN)
+    logger.info(f"Read binary data from S3: {binary_data_read}")
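The new read_jsonl fetches a byte range through the S3 Range header. A small sketch of the header construction it uses (the same conditional expression as in the method; note that HTTP ranges are inclusive, so bytes=0-10 returns 11 bytes):

    def make_range_header(byte_start=0, byte_end=None) -> str:
        # Same construction as in S3ReaderWriter.read_jsonl: open-ended
        # when byte_end is None, a closed inclusive range otherwise.
        return f'bytes={byte_start}-{byte_end}' if byte_end else f'bytes={byte_start}-'

    print(make_range_header(0, 10))  # bytes=0-10 -> the first 11 bytes of the object
    print(make_range_header(100))    # bytes=100- -> everything from offset 100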
magic_pdf/libs/commons.py

@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
 # json_dump_path = "s3://pdf_books_temp/json_dump/"  # this path is only for temporary local testing and must not be committed to main
 json_dump_path = "s3://llm-pdf-text/json_dump/"
-s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"  # TODO: the base library should not contain such paths; they belong in business code
+# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"  # the base library should not contain such paths; they belong in business code

 def get_top_percent_list(num_list, percent):
@@ -120,28 +120,8 @@ def read_file(pdf_path: str, s3_profile):
         return f.read()

-def get_docx_model_output(pdf_model_output, pdf_model_s3_profile, page_id):
-    if isinstance(pdf_model_output, str):
-        model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")  # the model's page numbering starts at 1
-        if os.path.exists(model_output_json_path):
-            json_from_docx = read_file(model_output_json_path, pdf_model_s3_profile)
-            model_output_json = json.loads(json_from_docx)
-        else:
-            try:
-                model_output_json_path = join_path(pdf_model_output, "model.json")
-                with open(model_output_json_path, "r", encoding="utf-8") as f:
-                    model_output_json = json.load(f)
-                    model_output_json = model_output_json["doc_layout_result"][page_id]
-            except:
-                s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")
-                s3_model_output_json_path = join_path(pdf_model_output, f"{page_id}.json")
-                # s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id }.json")
-                # logger.warning(f"model_output_json_path: {model_output_json_path} not found. try to load from s3: {s3_model_output_json_path}")
-                s = read_file(s3_model_output_json_path, pdf_model_s3_profile)
-                return json.loads(s)
-    elif isinstance(pdf_model_output, list):
-        model_output_json = pdf_model_output[page_id]
-    return model_output_json
+def get_docx_model_output(pdf_model_output, page_id):
+    model_output_json = pdf_model_output[page_id]
+    return model_output_json
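get_docx_model_output now assumes the caller has already materialized the per-page model output as a list indexed by page_id. A hedged sketch of a caller-side adapter (load_model_output_as_list is hypothetical; the 1-based page_{n}.json naming follows the removed code):

    import json
    import os

    def load_model_output_as_list(model_output_dir: str, page_count: int) -> list:
        # Hypothetical adapter: eagerly load the per-page model JSON that the
        # old get_docx_model_output used to read lazily. The removed code used
        # 1-based filenames, so page_id 0 maps to page_1.json.
        pages = []
        for page_id in range(page_count):
            path = os.path.join(model_output_dir, f"page_{page_id + 1}.json")
            with open(path, "r", encoding="utf-8") as f:
                pages.append(json.load(f))
        return pages

    # After this, get_docx_model_output(pages, page_id) is a plain list lookup.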
magic_pdf/libs/hash_utils.py  (new file, mode 100644)

import hashlib


def compute_md5(file_bytes):
    hasher = hashlib.md5()
    hasher.update(file_bytes)
    return hasher.hexdigest().upper()


def compute_sha256(input_string):
    hasher = hashlib.sha256()
    # In Python 3, a string must be encoded to bytes before it can be hashed
    input_bytes = input_string.encode('utf-8')
    hasher.update(input_bytes)
    return hasher.hexdigest()
magic_pdf/libs/pdf_image_tools.py

 import os
-from pathlib import Path
-from typing import Tuple
-import io
-# from app.common.s3 import get_s3_client
 from magic_pdf.libs.commons import fitz
 from loguru import logger
-from magic_pdf.libs.commons import parse_bucket_key, join_path
+from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.hash_utils import compute_sha256


-def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_return_path=None, img_s3_client=None, upload_switch=True):
+def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter):
     """
     Crop a jpg image out of page number page_num of `page` according to bbox, and return the image path.
     save_path must support both s3 and local paths; the image is stored under it with the filename
     {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg, with the bbox values truncated to int.
     """
-    # build the filename
-    filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
-    # build the paths
-    image_save_path = join_path(save_parent_path, filename)
-    s3_img_path = join_path(s3_return_path, filename) if s3_return_path is not None else None
-    # print the image filename
-    # print(f"Saved {image_save_path}")
-    # check the coordinates
-    # x_check = int(bbox[2]) - int(bbox[0])
-    # y_check = int(bbox[3]) - int(bbox[1])
-    # if x_check <= 0 or y_check <= 0:
-    #     if image_save_path.startswith("s3://"):
-    #         logger.exception(f"bad image coordinates, x1<x0 or y1<y0, {s3_img_path}")
-    #         return s3_img_path
-    #     else:
-    #         logger.exception(f"bad image coordinates, x1<x0 or y1<y0, {image_save_path}")
-    #         return image_save_path
+    filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
+    # the old version returned a path without the bucket
+    img_path = join_path(return_path, filename) if return_path is not None else None
+    # the new version generates a flat path
+    img_hash256_path = f"{compute_sha256(img_path)}.jpg"
     # convert the coordinates into a fitz.Rect object
     rect = fitz.Rect(*bbox)
@@ -42,39 +26,17 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
     # crop the image
     pix = page.get_pixmap(clip=rect, matrix=zoom)
-    if image_save_path.startswith("s3://"):
-        if not upload_switch:
-            pass
-        else:
-            # save the image to s3
-            bucket_name, bucket_key = parse_bucket_key(image_save_path)
-            # upload the byte stream to s3
-            byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
-            file_obj = io.BytesIO(byte_data)
-            if img_s3_client is not None:
-                img_s3_client.upload_fileobj(file_obj, bucket_name, bucket_key)
-                # create a fresh client for each upload task
-                # img_s3_client_once = get_s3_client(image_save_path)
-                # img_s3_client_once.upload_fileobj(file_obj, bucket_name, bucket_key)
-            else:
-                logger.exception("must input img_s3_client")
-        return s3_img_path
-    else:
-        # save the image locally; first check whether the parent directory of image_save_path exists, and create it if not
-        parent_dir = os.path.dirname(image_save_path)
-        if not os.path.exists(parent_dir):
-            os.makedirs(parent_dir)
-        pix.save(image_save_path, jpg_quality=95)
-        # rewrite the path relative to the markdown file so it renders directly in markdown
-        pth = Path(image_save_path)
-        image_save_path = f"{pth.parent.name}/{pth.name}"
-        return image_save_path
+    byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
+    imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
+    return img_hash256_path

-def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_path: str, image_bboxes: list, images_overlap_backup: list, table_bboxes: list, equation_inline_bboxes: list, equation_interline_bboxes: list, img_s3_client) -> dict:
+def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str, image_bboxes: list, images_overlap_backup: list, table_bboxes: list, equation_inline_bboxes: list, equation_interline_bboxes: list, imageWriter) -> dict:
     """
     Return a dict whose keys are bboxes and whose values are image paths.
     """
@@ -85,53 +47,30 @@ def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_p
     interline_eq_info = []

-    # image save paths are laid out as: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
-    s3_return_image_path = join_path(book_name, "images")
-    image_save_path = join_path(save_path, s3_return_image_path)
-    s3_return_table_path = join_path(book_name, "tables")
-    table_save_path = join_path(save_path, s3_return_table_path)
-    s3_return_equations_inline_path = join_path(book_name, "equations_inline")
-    equation_inline_save_path = join_path(save_path, s3_return_equations_inline_path)
-    s3_return_equation_interline_path = join_path(book_name, "equation_interline")
-    equation_interline_save_path = join_path(save_path, s3_return_equation_interline_path)
+    def return_path(type):
+        return join_path(pdf_bytes_md5, type)

     for bbox in image_bboxes:
         if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
             logger.warning(f"image_bboxes: invalid box, {bbox}")
             continue
-        image_path = cut_image(bbox, page_num, page, image_save_path, s3_return_image_path, img_s3_client)
+        image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
         image_info.append({"bbox": bbox, "image_path": image_path})

     for bbox in images_overlap_backup:
         if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
             logger.warning(f"images_overlap_backup: invalid box, {bbox}")
             continue
-        image_path = cut_image(bbox, page_num, page, image_save_path, s3_return_image_path, img_s3_client)
+        image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
         image_backup_info.append({"bbox": bbox, "image_path": image_path})

     for bbox in table_bboxes:
         if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
             logger.warning(f"table_bboxes: invalid box, {bbox}")
             continue
-        image_path = cut_image(bbox, page_num, page, table_save_path, s3_return_table_path, img_s3_client)
+        image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
         table_info.append({"bbox": bbox, "image_path": image_path})

-    for bbox in equation_inline_bboxes:
-        if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
-            logger.warning(f"equation_inline_bboxes: invalid box, {bbox}")
-            continue
-        image_path = cut_image(bbox[:4], page_num, page, equation_inline_save_path, s3_return_equations_inline_path, img_s3_client, upload_switch=False)
-        inline_eq_info.append({'bbox': bbox[:4], "image_path": image_path, "latex_text": bbox[4]})

-    for bbox in equation_interline_bboxes:
-        if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
-            logger.warning(f"equation_interline_bboxes: invalid box, {bbox}")
-            continue
-        image_path = cut_image(bbox[:4], page_num, page, equation_interline_save_path, s3_return_equation_interline_path, img_s3_client, upload_switch=False)
-        interline_eq_info.append({"bbox": bbox[:4], "image_path": image_path, "latex_text": bbox[4]})

     return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
\ No newline at end of file
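The rewritten cut_image replaces the {save_path}/{book_name}/{images|tables|equations}/... directory layout with a flat, content-addressed filename: the legacy relative path is hashed with compute_sha256 and the digest becomes the name. A self-contained sketch of the mapping (the example path is illustrative):

    import hashlib

    def compute_sha256(input_string):  # same logic as magic_pdf.libs.hash_utils
        return hashlib.sha256(input_string.encode('utf-8')).hexdigest()

    # cut_image builds a legacy-style relative path,
    # {pdf_bytes_md5}/{images|tables}/{page_num}_{x0}_{y0}_{x1}_{y1},
    # then flattens it into a 64-character hex name plus ".jpg":
    legacy_rel_path = "9C2E.../images/3_10_20_300_400"  # illustrative md5 prefix
    flat_name = f"{compute_sha256(legacy_rel_path)}.jpg"
    # The imageWriter decides whether flat_name lands on local disk or in s3.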
magic_pdf/pdf_parse_by_ocr.py

 import json
 import os
 import time
 from loguru import logger

-from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_text_bbox
 from magic_pdf.libs.commons import (
     read_file,
     join_path,
     fitz,
-    get_img_s3_client,
     get_delta_time,
     get_docx_model_output,
 )
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.drop_tag import DropTag
+from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.ocr_content_type import ContentType
-from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.para.para_split import para_split
 from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
 from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
@@ -38,38 +30,16 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 def parse_pdf_by_ocr(
     pdf_bytes,
     pdf_model_output,
-    save_path,
-    book_name,
-    pdf_model_profile=None,
-    image_s3_config=None,
+    imageWriter,
     start_page_id=0,
     end_page_id=None,
     debug_mode=False,
 ):
-    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
-    book_name = sanitize_filename(book_name)
-    md_bookname_save_path = ""
-    if debug_mode:
-        save_path = join_path(save_tmp_path, "md")
-        pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
-        if not os.path.exists(os.path.dirname(pdf_local_path)):
-            # create the directory if it does not exist
-            os.makedirs(os.path.dirname(pdf_local_path))
-        md_bookname_save_path = join_path(save_tmp_path, "md", book_name)
-        if not os.path.exists(md_bookname_save_path):
-            # create the directory if it does not exist
-            os.makedirs(md_bookname_save_path)
-        with open(pdf_local_path + ".pdf", "wb") as pdf_file:
-            pdf_file.write(pdf_bytes)
+    pdf_bytes_md5 = compute_md5(pdf_bytes)
     pdf_docs = fitz.open("pdf", pdf_bytes)
     # initialize an empty pdf_info_dict
     pdf_info_dict = {}
-    img_s3_client = get_img_s3_client(save_path, image_s3_config)
     start_time = time.time()
@@ -91,16 +61,14 @@ def parse_pdf_by_ocr(
         # fetch the model data for the current page
         ocr_page_info = get_docx_model_output(
-            pdf_model_output, pdf_model_profile, page_id
+            pdf_model_output, page_id
         )
         """get the page-number, header and footer bboxes from the json"""
         page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
         header_bboxes = parse_headers(page_id, page, ocr_page_info)
         footer_bboxes = parse_footers(page_id, page, ocr_page_info)
-        footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path,
-                                                   debug_mode=debug_mode)
+        footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info,
+                                                   debug_mode=debug_mode)

         # build the dict of span bboxes that need to be removed
         need_remove_spans_bboxes_dict = {
@@ -179,7 +147,7 @@ def parse_pdf_by_ocr(
         spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
         '''crop images and tables'''
-        spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
+        spans = cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter)
         '''adjust inline equations: height adjusted to match the text on the same line (prefer the left side, then the right)'''
         displayed_list = []
@@ -242,16 +210,4 @@ def parse_pdf_by_ocr(
     """paragraph splitting"""
     para_split(pdf_info_dict, debug_mode=debug_mode)

-    '''while testing, save debug information'''
-    if debug_mode:
-        params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
-        with open(params_file_save_path, "w", encoding="utf-8") as f:
-            json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
-
-        # draw_bbox
-        draw_layout_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
-        draw_text_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
-
     return pdf_info_dict
magic_pdf/pdf_parse_by_txt.py

@@ -12,6 +12,7 @@ from magic_pdf.layout.bbox_sort import (
 )
 from magic_pdf.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
 from magic_pdf.libs.drop_reason import DropReason
+from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.markdown_utils import escape_special_markdown_char
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
@@ -73,46 +74,20 @@ paraMergeException_msg = ParaMergeException().message
 def parse_pdf_by_txt(
     pdf_bytes,
     pdf_model_output,
-    save_path,
-    book_name,
-    pdf_model_profile=None,
-    image_s3_config=None,
+    imageWriter,
     start_page_id=0,
     end_page_id=None,
     junk_img_bojids=[],
     debug_mode=False,
 ):
-    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
-    md_bookname_save_path = ""
-    book_name = sanitize_filename(book_name)
-    if debug_mode:
-        save_path = join_path(save_tmp_path, "md")
-        pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
-        if not os.path.exists(os.path.dirname(pdf_local_path)):
-            # create the directory if it does not exist
-            os.makedirs(os.path.dirname(pdf_local_path))
-        md_bookname_save_path = join_path(save_tmp_path, "md", book_name)
-        if not os.path.exists(md_bookname_save_path):
-            # create the directory if it does not exist
-            os.makedirs(md_bookname_save_path)
-        with open(pdf_local_path + ".pdf", "wb") as pdf_file:
-            pdf_file.write(pdf_bytes)
+    pdf_bytes_md5 = compute_md5(pdf_bytes)
     pdf_docs = fitz.open("pdf", pdf_bytes)
     pdf_info_dict = {}
-    img_s3_client = get_img_s3_client(save_path, image_s3_config)  # function name and params changed to avoid ambiguity
-    # img_s3_client = "img_s3_client"  # do not create the object; use a string placeholder instead
     start_time = time.time()

     """identify the main body font by collecting statistics over all text in the pdf"""
     main_text_font = get_main_text_font(pdf_docs)

     end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
     for page_id in range(start_page_id, end_page_id + 1):
         page = pdf_docs[page_id]
@@ -128,19 +103,10 @@ def parse_pdf_by_txt(
         count the images with non-duplicate ids on a single page; if the current page exceeds 1500, return need_drop directly
         """
         page_imgs = page.get_images()
-        img_counts = 0
-        for img in page_imgs:
-            img_bojid = img[0]
-            if img_bojid in junk_img_bojids:  # check whether this image is in the junk list
-                continue  # if it is in the junk list, skip it
-            else:
-                recs = page.get_image_rects(img, transform=True)
-                if recs:  # the image is actually displayed on the current page
-                    img_counts += 1
-        if img_counts >= 1500:  # if a single page still has >= 1500 images after removing junk images, drop the whole pdf
-            logger.warning(
-                f"page_id: {page_id}, img_counts: {img_counts}, drop this pdf: {book_name}, drop_reason: {DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}"
-            )
+        # drop the dependency on junk images to simplify the logic
+        if len(page_imgs) > 1500:  # if the current page has more than 1500 images, skip it directly
+            logger.warning(f"page_id: {page_id}, img_counts: {len(page_imgs)}, drop this pdf")
             result = {"need_drop": True, "drop_reason": DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}
             if not debug_mode:
                 return result
@@ -154,10 +120,10 @@ def parse_pdf_by_txt(
             "dict",
             flags=fitz.TEXTFLAGS_TEXT,
         )["blocks"]
-        model_output_json = get_docx_model_output(pdf_model_output, pdf_model_profile, page_id)
+        model_output_json = get_docx_model_output(pdf_model_output, page_id)

         # parse images
-        image_bboxes = parse_images(page_id, page, model_output_json, junk_img_bojids)
+        image_bboxes = parse_images(page_id, page, model_output_json)
         image_bboxes = fix_image_vertical(image_bboxes, text_raw_blocks)  # fix image positions
         image_bboxes = fix_seperated_image(image_bboxes)  # merge images whose edges overlap
         image_bboxes = include_img_title(text_raw_blocks, image_bboxes)  # look above and below each image for a title; rule-based matching, English-only for now
@@ -225,22 +191,18 @@ def parse_pdf_by_txt(
         """=================================================================================================================================="""
-        if debug_mode:
-            # in debug mode, save the crops locally
-            save_path = join_path(save_tmp_path, "md")
-
         # crop figures, tables and equations, save them to storage, and return the image paths as content
         image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = save_images_by_bboxes(
-            book_name,
             page_id,
             page,
-            save_path,
+            pdf_bytes_md5,
             image_bboxes,
             images_overlap_backup,
             table_bboxes,
             equations_inline_bboxes,
             equations_interline_bboxes,
-            # pass in img_s3_client
-            img_s3_client,
+            imageWriter,
         )
+        # only tables and images are cropped

         """below: the equation-replacement stage"""
@@ -253,13 +215,13 @@ def parse_pdf_by_txt(
         """remove footnotes from the text and images (try removing superscripts first, then footnotes)"""
         # footnotes detected by the model
-        footnote_bboxes_by_model = parse_footnotes_by_model(page_id, page, model_output_json, md_bookname_save_path,
-                                                            debug_mode=debug_mode)
+        footnote_bboxes_by_model = parse_footnotes_by_model(page_id, page, model_output_json,
+                                                            debug_mode=debug_mode)
         # footnotes detected by rules
         footnote_bboxes_by_rule = parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font)

         """run the pdf filter to drop unreasonable pdfs"""
         is_good_pdf, err = pdf_filter(page, remain_text_blocks, table_bboxes, image_bboxes)
         if not is_good_pdf:
-            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {err}")
+            logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {err}")
             if not debug_mode:
                 return err
@@ -273,7 +235,7 @@ def parse_pdf_by_txt(
         if is_text_block_horz_overlap:
             # debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in remain_text_blocks], [], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 0)
-            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
+            logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
             result = {"need_drop": True, "drop_reason": DropReason.TEXT_BLCOK_HOR_OVERLAP}
             if not debug_mode:
                 return result
@@ -292,21 +254,21 @@ def parse_pdf_by_txt(
         layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)

         if len(remain_text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
-            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
+            logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
             result = {"need_drop": True, "drop_reason": DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}
             if not debug_mode:
                 return result

         """below: drop complicated layouts and layouts with more than two columns"""
         if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]):  # complicated layout
-            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.COMPLICATED_LAYOUT}")
+            logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {DropReason.COMPLICATED_LAYOUT}")
             result = {"need_drop": True, "drop_reason": DropReason.COMPLICATED_LAYOUT}
             if not debug_mode:
                 return result

         layout_column_width = get_columns_cnt_of_layout(layout_tree)
         if layout_column_width > 2:  # drop pdfs whose layout has more than two columns
-            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
+            logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
             result = {
                 "need_drop": True,
                 "drop_reason": DropReason.TOO_MANY_LAYOUT_COLUMNS,
@@ -390,28 +352,11 @@ def parse_pdf_by_txt(
     for page_info in pdf_info_dict.values():
         is_good_pdf, err = pdf_post_filter(page_info)
         if not is_good_pdf:
-            logger.warning(f"page_id: {i}, drop this pdf: {book_name}, reason: {err}")
+            logger.warning(f"page_id: {i}, drop this pdf: {pdf_bytes_md5}, reason: {err}")
             if not debug_mode:
                 return err
         i += 1

-    if debug_mode:
-        params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
-        page_draw_rect_save_path = join_path(save_tmp_path, "md", book_name, "layout.pdf")
-        # dir_path = os.path.dirname(page_draw_rect_save_path)
-        # if not os.path.exists(dir_path):
-        #     # create the directory if it does not exist
-        #     os.makedirs(dir_path)
-
-        with open(params_file_save_path, "w", encoding="utf-8") as f:
-            json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
-
-        # if a local page_draw_rect_save_path already exists, delete it first
-        if os.path.exists(page_draw_rect_save_path):
-            os.remove(page_draw_rect_save_path)
-        # draw the bboxes and layout onto the pdf
-        draw_bbox_on_page(pdf_docs, pdf_info_dict, page_draw_rect_save_path)
-        draw_layout_bbox_on_page(pdf_docs, pdf_info_dict, header, footer, page_draw_rect_save_path)
-
     if debug_mode:
         # log the post-processing time
         logger.info(f"post_processing_time: {get_delta_time(start_time)}")
@@ -429,56 +374,28 @@ def parse_pdf_by_txt(
     para_process_pipeline = ParaProcessPipeline()

     def _deal_with_text_exception(error_info):
-        logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {error_info}")
+        logger.warning(f"page_id: {page_id}, drop this pdf: {pdf_bytes_md5}, reason: {error_info}")
         if error_info == denseSingleLineBlockException_msg:
-            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.DENSE_SINGLE_LINE_BLOCK}")
+            logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.DENSE_SINGLE_LINE_BLOCK}")
             result = {"need_drop": True, "drop_reason": DropReason.DENSE_SINGLE_LINE_BLOCK}
             return result
         if error_info == titleDetectionException_msg:
-            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_DETECTION_FAILED}")
+            logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.TITLE_DETECTION_FAILED}")
             result = {"need_drop": True, "drop_reason": DropReason.TITLE_DETECTION_FAILED}
             return result
         elif error_info == titleLevelException_msg:
-            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_LEVEL_FAILED}")
+            logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.TITLE_LEVEL_FAILED}")
             result = {"need_drop": True, "drop_reason": DropReason.TITLE_LEVEL_FAILED}
             return result
         elif error_info == paraSplitException_msg:
-            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.PARA_SPLIT_FAILED}")
+            logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.PARA_SPLIT_FAILED}")
             result = {"need_drop": True, "drop_reason": DropReason.PARA_SPLIT_FAILED}
             return result
         elif error_info == paraMergeException_msg:
-            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.PARA_MERGE_FAILED}")
+            logger.warning(f"Drop this pdf: {pdf_bytes_md5}, reason: {DropReason.PARA_MERGE_FAILED}")
             result = {"need_drop": True, "drop_reason": DropReason.PARA_MERGE_FAILED}
             return result

-    if debug_mode:
-        input_pdf_file = f"{pdf_local_path}.pdf"
-        output_dir = f"{save_path}/{book_name}"
-        output_pdf_file = f"{output_dir}/pdf_annos.pdf"
-
-        """
-        Call the para_process_pipeline function to process the pdf_info_dict.
-        Parameters:
-        para_debug_mode: str or None
-            If para_debug_mode is None, the para_process_pipeline will not keep any intermediate results.
-            If para_debug_mode is "simple", the para_process_pipeline will only keep the annos on the pdf and the final results as a json file.
-            If para_debug_mode is "full", the para_process_pipeline will keep all the intermediate results generated during each step.
-        """
-        pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline(
-            pdf_info_dict,
-            para_debug_mode="simple",
-            input_pdf_path=input_pdf_file,
-            output_pdf_path=output_pdf_file,
-        )
-        # log the paragraph-processing time
-        logger.info(f"para_process_time: {get_delta_time(start_time)}")
-
-        # do not return drop info while debugging
-        if error_info is not None:
-            _deal_with_text_exception(error_info)
-        return pdf_info_dict
-    else:
-        pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline(pdf_info_dict)
-        if error_info is not None:
-            return _deal_with_text_exception(error_info)
+    pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline(pdf_info_dict)
+    if error_info is not None:
+        return _deal_with_text_exception(error_info)
magic_pdf/pdf_parse_for_train.py

@@ -112,7 +112,6 @@ def parse_pdf_for_train(
     pdf_model_output,
     save_path,
     book_name,
-    pdf_model_profile=None,
     image_s3_config=None,
     start_page_id=0,
     end_page_id=None,
@@ -200,7 +199,7 @@ def parse_pdf_for_train(
         flags=fitz.TEXTFLAGS_TEXT,
     )["blocks"]
-    model_output_json = get_docx_model_output(pdf_model_output, pdf_model_profile, page_id)
+    model_output_json = get_docx_model_output(pdf_model_output, page_id)
     # parse images
magic_pdf/pre_proc/detect_footnote.py

@@ -3,7 +3,7 @@ from magic_pdf.libs.commons import fitz  # pyMuPDF library
 from magic_pdf.libs.coordinate_transform import get_scale_ratio

-def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
+def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path=None, debug_mode=False):
     """
     :param page_ID: int; the current page is page number page_ID of the current pdf document
     :param page: the content of the current page as read by fitz
magic_pdf/pre_proc/ocr_cut_image.py

@@ -3,18 +3,16 @@ from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.libs.pdf_image_tools import cut_image

-def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
-    def s3_return_path(type):
-        return join_path(book_name, type)
-
-    def img_save_path(type):
-        return join_path(save_path, s3_return_path(type))
+def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
+    def return_path(type):
+        return join_path(pdf_bytes_md5, type)

     for span in spans:
         span_type = span['type']
         if span_type == ContentType.Image:
-            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'),
-                                           s3_return_path=s3_return_path('images'), img_s3_client=img_s3_client)
+            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
+                                           imageWriter=imageWriter)
         elif span_type == ContentType.Table:
-            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'),
-                                           s3_return_path=s3_return_path('tables'), img_s3_client=img_s3_client)
+            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
+                                           imageWriter=imageWriter)

     return spans
magic_pdf/spark/spark_api.py

@@ -12,27 +12,78 @@
 The rest — constructing the s3 client and obtaining the ak/sk — is implemented in code-clean. Do not introduce reverse dependencies!!!
 """
 from loguru import logger

 from magic_pdf.io import AbsReaderWriter
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt


 def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
     """
     Parse a text-based pdf
     """
-    pass
+    pdf_info_dict = parse_pdf_by_txt(
+        pdf_bytes,
+        pdf_models,
+        imageWriter,
+        start_page_id=start_page,
+        debug_mode=is_debug,
+    )
+    pdf_info_dict["parse_type"] = "txt"
+    return pdf_info_dict


 def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
     """
     Parse an ocr-based pdf
     """
-    pass
+    pdf_info_dict = parse_pdf_by_ocr(
+        pdf_bytes,
+        pdf_models,
+        imageWriter,
+        start_page_id=start_page,
+        debug_mode=is_debug,
+    )
+    pdf_info_dict["parse_type"] = "ocr"
+    return pdf_info_dict


 def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
     """
     Parse a pdf that mixes ocr and text content, extracting everything
     """
+    def parse_pdf(method):
+        try:
+            return method(
+                pdf_bytes,
+                pdf_models,
+                imageWriter,
+                start_page_id=start_page,
+                debug_mode=is_debug,
+            )
+        except Exception as e:
+            logger.error(f"{method.__name__} error: {e}")
+            return None
+
+    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
+    if pdf_info_dict is None or pdf_info_dict.get("need_drop", False):
+        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
+        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
+        if pdf_info_dict is None:
+            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
+        else:
+            pdf_info_dict["parse_type"] = "ocr"
+    else:
+        pdf_info_dict["parse_type"] = "txt"
+
+    return pdf_info_dict


 def spark_json_extractor(jso: dict):
     pass
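With these entry points taking an imageWriter instead of an s3 client plus save paths, any AbsReaderWriter implementation can receive the image crops. A minimal wiring sketch, assuming the DiskReaderWriter from this commit (the file name and the /tmp/images target are placeholders):

    from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
    from magic_pdf.spark.spark_api import parse_union_pdf

    with open("example.pdf", "rb") as f:
        pdf_bytes = f.read()

    pdf_models = [...]  # per-page model output, one entry per page

    # Crops are written under /tmp/images; an S3ReaderWriter works the same way.
    imageWriter = DiskReaderWriter("/tmp/images")
    pdf_info_dict = parse_union_pdf(pdf_bytes, pdf_models, imageWriter, is_debug=True)
    print(pdf_info_dict["parse_type"])  # "txt" or "ocr"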
tools/ocr_badcase.py

@@ -867,6 +867,7 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path, base_data
     save_results(result_dict, overall_report_dict, badcase_file, overall_file)

     result = compare_edit_distance(base_data_path, overall_report_dict)
+<<<<<<< HEAD
     if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
         try:
@@ -874,7 +875,10 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path, base_data
             upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
         except Exception as e:
             print(f"An error occurred while uploading to S3: {e}")
+=======
+>>>>>>> ff8f62aa3c28facc192104387f131d87978064fc
     print(result)
     assert result == 1


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Main function that runs the entire evaluation pipeline.")
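The upload_to_s3 helper called above is defined outside this hunk; only its call signature appears in the diff. A hedged sketch of what such a function might look like with boto3 (the body here is an assumption):

    import os
    import boto3

    def upload_to_s3(file_path, bucket_name, s3_directory,
                     aws_access_key, aws_secret_key, endpoint_url):
        # Hypothetical implementation matching the call site in main() above.
        client = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
            endpoint_url=endpoint_url,
        )
        key = os.path.join(s3_directory, os.path.basename(file_path))
        client.upload_file(file_path, bucket_name, key)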