Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
a828ebd7
Unverified
Commit
a828ebd7
authored
Apr 10, 2024
by
drunkpig
Committed by
GitHub
Apr 10, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #17 from myhloli/master
实现spark_api,parse逻辑重构,切图逻辑重构
parents
049104a4
877160a7
Changes
11
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
153 additions
and
298 deletions
+153
-298
ocr_demo.py
demo/ocr_demo.py
+4
-3
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+4
-5
commons.py
magic_pdf/libs/commons.py
+4
-24
hash_utils.py
magic_pdf/libs/hash_utils.py
+15
-0
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+31
-92
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+6
-50
pdf_parse_by_txt.py
magic_pdf/pdf_parse_by_txt.py
+27
-110
pdf_parse_for_train.py
magic_pdf/pdf_parse_for_train.py
+1
-2
detect_footnote.py
magic_pdf/pre_proc/detect_footnote.py
+1
-1
ocr_cut_image.py
magic_pdf/pre_proc/ocr_cut_image.py
+5
-7
spark_api.py
magic_pdf/spark/spark_api.py
+55
-4
No files found.
demo/ocr_demo.py
View file @
a828ebd7
...
@@ -115,8 +115,9 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
...
@@ -115,8 +115,9 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
ocr_local_parse
(
pdf_path
,
json_file_path
)
# ocr_local_parse(pdf_path, json_file_path)
# book_name = "科数网/edu_00011318"
# ocr_online_parse(book_name)
book_name
=
"数学新星网/edu_00001236"
ocr_online_parse
(
book_name
)
pass
pass
magic_pdf/dict2md/ocr_mkcontent.py
View file @
a828ebd7
from
magic_pdf.libs.commons
import
s3_image_save_path
,
join_path
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
...
@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
...
@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
if
not
span
.
get
(
'image_path'
):
if
not
span
.
get
(
'image_path'
):
continue
continue
else
:
else
:
content
=
f
"
})"
content
=
f
""
else
:
else
:
content
=
ocr_escape_special_markdown_char
(
span
[
'content'
])
# 转义特殊符号
content
=
ocr_escape_special_markdown_char
(
span
[
'content'
])
# 转义特殊符号
if
span
[
'type'
]
==
ContentType
.
InlineEquation
:
if
span
[
'type'
]
==
ContentType
.
InlineEquation
:
...
@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
...
@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
if
mode
==
'mm'
:
if
mode
==
'mm'
:
content
=
f
"
\n

})
\n
"
content
=
f
"
\n

\n
"
elif
mode
==
'nlp'
:
elif
mode
==
'nlp'
:
pass
pass
if
content
!=
''
:
if
content
!=
''
:
...
@@ -195,13 +194,13 @@ def line_to_standard_format(line):
...
@@ -195,13 +194,13 @@ def line_to_standard_format(line):
if
span
[
'type'
]
==
ContentType
.
Image
:
if
span
[
'type'
]
==
ContentType
.
Image
:
content
=
{
content
=
{
'type'
:
'image'
,
'type'
:
'image'
,
'img_path'
:
join_path
(
s3_image_save_path
,
span
[
'image_path'
])
'img_path'
:
span
[
'image_path'
]
}
}
return
content
return
content
elif
span
[
'type'
]
==
ContentType
.
Table
:
elif
span
[
'type'
]
==
ContentType
.
Table
:
content
=
{
content
=
{
'type'
:
'table'
,
'type'
:
'table'
,
'img_path'
:
join_path
(
s3_image_save_path
,
span
[
'image_path'
])
'img_path'
:
span
[
'image_path'
]
}
}
return
content
return
content
else
:
else
:
...
...
magic_pdf/libs/commons.py
View file @
a828ebd7
...
@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
...
@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
json_dump_path
=
"s3://llm-pdf-text/json_dump/"
json_dump_path
=
"s3://llm-pdf-text/json_dump/"
s3_image_save_path
=
"s3://mllm-raw-media/pdf2md_img/"
# TODO
基础库不应该有这些存在的路径,应该在业务代码中定义
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" #
基础库不应该有这些存在的路径,应该在业务代码中定义
def
get_top_percent_list
(
num_list
,
percent
):
def
get_top_percent_list
(
num_list
,
percent
):
...
@@ -120,29 +120,9 @@ def read_file(pdf_path: str, s3_profile):
...
@@ -120,29 +120,9 @@ def read_file(pdf_path: str, s3_profile):
return
f
.
read
()
return
f
.
read
()
def
get_docx_model_output
(
pdf_model_output
,
pdf_model_s3_profile
,
page_id
):
def
get_docx_model_output
(
pdf_model_output
,
page_id
):
if
isinstance
(
pdf_model_output
,
str
):
model_output_json_path
=
join_path
(
pdf_model_output
,
f
"page_{page_id + 1}.json"
)
# 模型输出的页面编号从1开始的
model_output_json
=
pdf_model_output
[
page_id
]
if
os
.
path
.
exists
(
model_output_json_path
):
json_from_docx
=
read_file
(
model_output_json_path
,
pdf_model_s3_profile
)
model_output_json
=
json
.
loads
(
json_from_docx
)
else
:
try
:
model_output_json_path
=
join_path
(
pdf_model_output
,
"model.json"
)
with
open
(
model_output_json_path
,
"r"
,
encoding
=
"utf-8"
)
as
f
:
model_output_json
=
json
.
load
(
f
)
model_output_json
=
model_output_json
[
"doc_layout_result"
][
page_id
]
except
:
s3_model_output_json_path
=
join_path
(
pdf_model_output
,
f
"page_{page_id + 1}.json"
)
s3_model_output_json_path
=
join_path
(
pdf_model_output
,
f
"{page_id}.json"
)
#s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id }.json")
# logger.warning(f"model_output_json_path: {model_output_json_path} not found. try to load from s3: {s3_model_output_json_path}")
s
=
read_file
(
s3_model_output_json_path
,
pdf_model_s3_profile
)
return
json
.
loads
(
s
)
elif
isinstance
(
pdf_model_output
,
list
):
model_output_json
=
pdf_model_output
[
page_id
]
return
model_output_json
return
model_output_json
...
...
magic_pdf/libs/hash_utils.py
0 → 100644
View file @
a828ebd7
import
hashlib
def
compute_md5
(
file_bytes
):
hasher
=
hashlib
.
md5
()
hasher
.
update
(
file_bytes
)
return
hasher
.
hexdigest
()
.
upper
()
def
compute_sha256
(
input_string
):
hasher
=
hashlib
.
sha256
()
# 在Python3中,需要将字符串转化为字节对象才能被哈希函数处理
input_bytes
=
input_string
.
encode
(
'utf-8'
)
hasher
.
update
(
input_bytes
)
return
hasher
.
hexdigest
()
magic_pdf/libs/pdf_image_tools.py
View file @
a828ebd7
import
os
from
pathlib
import
Path
from
typing
import
Tuple
import
io
# from app.common.s3 import get_s3_client
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.libs.commons
import
fitz
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.commons
import
parse_bucket_key
,
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.hash_utils
import
compute_sha256
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_return_path
=
None
,
img_s3_client
=
None
,
upload_switch
=
True
):
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
return_path
,
imageWriter
):
"""
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
"""
"""
# 拼接文件名
# 拼接文件名
filename
=
f
"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
filename
=
f
"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
# 拼接路径
image_save_path
=
join_path
(
save_parent_path
,
filename
)
s3_img_path
=
join_path
(
s3_return_path
,
filename
)
if
s3_return_path
is
not
None
else
None
# 打印图片文件名
# print(f"Saved {image_save_path}")
#检查坐标
# x_check = int(bbox[2]) - int(bbox[0])
# y_check = int(bbox[3]) - int(bbox[1])
# if x_check <= 0 or y_check <= 0:
#
# if image_save_path.startswith("s3://"):
# logger.exception(f"传入图片坐标有误,x1<x0或y1<y0,{s3_img_path}")
# return s3_img_path
# else:
# logger.exception(f"传入图片坐标有误,x1<x0或y1<y0,{image_save_path}")
# return image_save_path
# 老版本返回不带bucket的路径
img_path
=
join_path
(
return_path
,
filename
)
if
return_path
is
not
None
else
None
# 新版本生成平铺路径
img_hash256_path
=
f
"{compute_sha256(img_path)}.jpg"
# 将坐标转换为fitz.Rect对象
# 将坐标转换为fitz.Rect对象
rect
=
fitz
.
Rect
(
*
bbox
)
rect
=
fitz
.
Rect
(
*
bbox
)
...
@@ -42,39 +26,17 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
...
@@ -42,39 +26,17 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
# 截取图片
# 截取图片
pix
=
page
.
get_pixmap
(
clip
=
rect
,
matrix
=
zoom
)
pix
=
page
.
get_pixmap
(
clip
=
rect
,
matrix
=
zoom
)
if
image_save_path
.
startswith
(
"s3://"
):
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
if
not
upload_switch
:
pass
imageWriter
.
write
(
data
=
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
else
:
# 图片保存到s3
return
img_hash256_path
bucket_name
,
bucket_key
=
parse_bucket_key
(
image_save_path
)
# 将字节流上传到s3
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
def
save_images_by_bboxes
(
page_num
:
int
,
page
:
fitz
.
Page
,
pdf_bytes_md5
:
str
,
file_obj
=
io
.
BytesIO
(
byte_data
)
image_bboxes
:
list
,
images_overlap_backup
:
list
,
table_bboxes
:
list
,
if
img_s3_client
is
not
None
:
equation_inline_bboxes
:
list
,
img_s3_client
.
upload_fileobj
(
file_obj
,
bucket_name
,
bucket_key
)
equation_interline_bboxes
:
list
,
imageWriter
)
->
dict
:
# 每个图片上传任务都创建一个新的client
# img_s3_client_once = get_s3_client(image_save_path)
# img_s3_client_once.upload_fileobj(file_obj, bucket_name, bucket_key)
else
:
logger
.
exception
(
"must input img_s3_client"
)
return
s3_img_path
else
:
# 保存图片到本地
# 先检查一下image_save_path的父目录是否存在,如果不存在,就创建
parent_dir
=
os
.
path
.
dirname
(
image_save_path
)
if
not
os
.
path
.
exists
(
parent_dir
):
os
.
makedirs
(
parent_dir
)
pix
.
save
(
image_save_path
,
jpg_quality
=
95
)
# 为了直接能在markdown里看,这里把地址改为相对于mardown的地址
pth
=
Path
(
image_save_path
)
image_save_path
=
f
"{pth.parent.name}/{pth.name}"
return
image_save_path
def
save_images_by_bboxes
(
book_name
:
str
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_path
:
str
,
image_bboxes
:
list
,
images_overlap_backup
:
list
,
table_bboxes
:
list
,
equation_inline_bboxes
:
list
,
equation_interline_bboxes
:
list
,
img_s3_client
)
->
dict
:
"""
"""
返回一个dict, key为bbox, 值是图片地址
返回一个dict, key为bbox, 值是图片地址
"""
"""
...
@@ -85,53 +47,30 @@ def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_p
...
@@ -85,53 +47,30 @@ def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_p
interline_eq_info
=
[]
interline_eq_info
=
[]
# 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
# 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
s3_return_image_path
=
join_path
(
book_name
,
"images"
)
image_save_path
=
join_path
(
save_path
,
s3_return_image_path
)
s3_return_table_path
=
join_path
(
book_name
,
"tables"
)
table_save_path
=
join_path
(
save_path
,
s3_return_table_path
)
s3_return_equations_inline_path
=
join_path
(
book_name
,
"equations_inline"
)
equation_inline_save_path
=
join_path
(
save_path
,
s3_return_equations_inline_path
)
s3_return_equation_interline_path
=
join_path
(
book_name
,
"equation_interline"
)
equation_interline_save_path
=
join_path
(
save_path
,
s3_return_equation_interline_path
)
def
return_path
(
type
):
return
join_path
(
pdf_bytes_md5
,
type
)
for
bbox
in
image_bboxes
:
for
bbox
in
image_bboxes
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"image_bboxes: 错误的box, {bbox}"
)
logger
.
warning
(
f
"image_bboxes: 错误的box, {bbox}"
)
continue
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
image_save_path
,
s3_return_image_path
,
img_s3_client
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
image_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
images_overlap_backup
:
for
bbox
in
images_overlap_backup
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"images_overlap_backup: 错误的box, {bbox}"
)
logger
.
warning
(
f
"images_overlap_backup: 错误的box, {bbox}"
)
continue
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
image_save_path
,
s3_return_image_path
,
img_s3_client
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_backup_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
image_backup_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
table_bboxes
:
for
bbox
in
table_bboxes
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"table_bboxes: 错误的box, {bbox}"
)
logger
.
warning
(
f
"table_bboxes: 错误的box, {bbox}"
)
continue
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
table_save_path
,
s3_return_table_path
,
img_s3_client
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"tables"
),
imageWriter
)
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
equation_inline_bboxes
:
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
\ No newline at end of file
logger
.
warning
(
f
"equation_inline_bboxes: 错误的box, {bbox}"
)
continue
image_path
=
cut_image
(
bbox
[:
4
],
page_num
,
page
,
equation_inline_save_path
,
s3_return_equations_inline_path
,
img_s3_client
,
upload_switch
=
False
)
inline_eq_info
.
append
({
'bbox'
:
bbox
[:
4
],
"image_path"
:
image_path
,
"latex_text"
:
bbox
[
4
]})
for
bbox
in
equation_interline_bboxes
:
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"equation_interline_bboxes: 错误的box, {bbox}"
)
continue
image_path
=
cut_image
(
bbox
[:
4
],
page_num
,
page
,
equation_interline_save_path
,
s3_return_equation_interline_path
,
img_s3_client
,
upload_switch
=
False
)
interline_eq_info
.
append
({
"bbox"
:
bbox
[:
4
],
"image_path"
:
image_path
,
"latex_text"
:
bbox
[
4
]})
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
\ No newline at end of file
magic_pdf/pdf_parse_by_ocr.py
View file @
a828ebd7
import
json
import
os
import
time
import
time
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_text_bbox
from
magic_pdf.libs.commons
import
(
from
magic_pdf.libs.commons
import
(
read_file
,
join_path
,
fitz
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_delta_time
,
get_docx_model_output
,
get_docx_model_output
,
)
)
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.para.para_split
import
para_split
from
magic_pdf.para.para_split
import
para_split
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
...
@@ -38,38 +30,16 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
...
@@ -38,38 +30,16 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def
parse_pdf_by_ocr
(
def
parse_pdf_by_ocr
(
pdf_bytes
,
pdf_bytes
,
pdf_model_output
,
pdf_model_output
,
save_path
,
imageWriter
,
book_name
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
start_page_id
=
0
,
end_page_id
=
None
,
end_page_id
=
None
,
debug_mode
=
False
,
debug_mode
=
False
,
):
):
pdf_bytes_md5
=
compute_md5
(
pdf_bytes
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
book_name
=
sanitize_filename
(
book_name
)
md_bookname_save_path
=
""
if
debug_mode
:
save_path
=
join_path
(
save_tmp_path
,
"md"
)
pdf_local_path
=
join_path
(
save_tmp_path
,
"download-pdfs"
,
book_name
)
if
not
os
.
path
.
exists
(
os
.
path
.
dirname
(
pdf_local_path
)):
# 如果目录不存在,创建它
os
.
makedirs
(
os
.
path
.
dirname
(
pdf_local_path
))
md_bookname_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
)
if
not
os
.
path
.
exists
(
md_bookname_save_path
):
# 如果目录不存在,创建它
os
.
makedirs
(
md_bookname_save_path
)
with
open
(
pdf_local_path
+
".pdf"
,
"wb"
)
as
pdf_file
:
pdf_file
.
write
(
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
# 初始化空的pdf_info_dict
# 初始化空的pdf_info_dict
pdf_info_dict
=
{}
pdf_info_dict
=
{}
img_s3_client
=
get_img_s3_client
(
save_path
,
image_s3_config
)
start_time
=
time
.
time
()
start_time
=
time
.
time
()
...
@@ -91,16 +61,14 @@ def parse_pdf_by_ocr(
...
@@ -91,16 +61,14 @@ def parse_pdf_by_ocr(
# 获取当前页的模型数据
# 获取当前页的模型数据
ocr_page_info
=
get_docx_model_output
(
ocr_page_info
=
get_docx_model_output
(
pdf_model_output
,
p
df_model_profile
,
p
age_id
pdf_model_output
,
page_id
)
)
"""从json中获取每页的页码、页眉、页脚的bbox"""
"""从json中获取每页的页码、页眉、页脚的bbox"""
page_no_bboxes
=
parse_pageNos
(
page_id
,
page
,
ocr_page_info
)
page_no_bboxes
=
parse_pageNos
(
page_id
,
page
,
ocr_page_info
)
header_bboxes
=
parse_headers
(
page_id
,
page
,
ocr_page_info
)
header_bboxes
=
parse_headers
(
page_id
,
page
,
ocr_page_info
)
footer_bboxes
=
parse_footers
(
page_id
,
page
,
ocr_page_info
)
footer_bboxes
=
parse_footers
(
page_id
,
page
,
ocr_page_info
)
footnote_bboxes
=
parse_footnotes_by_model
(
footnote_bboxes
=
parse_footnotes_by_model
(
page_id
,
page
,
ocr_page_info
,
debug_mode
=
debug_mode
)
page_id
,
page
,
ocr_page_info
,
md_bookname_save_path
,
debug_mode
=
debug_mode
)
# 构建需要remove的bbox字典
# 构建需要remove的bbox字典
need_remove_spans_bboxes_dict
=
{
need_remove_spans_bboxes_dict
=
{
...
@@ -179,7 +147,7 @@ def parse_pdf_by_ocr(
...
@@ -179,7 +147,7 @@ def parse_pdf_by_ocr(
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
spans
,
dropped_spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
'''对image和table截图'''
'''对image和table截图'''
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
pdf_bytes_md5
,
imageWriter
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list
=
[]
displayed_list
=
[]
...
@@ -242,16 +210,4 @@ def parse_pdf_by_ocr(
...
@@ -242,16 +210,4 @@ def parse_pdf_by_ocr(
"""分段"""
"""分段"""
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
'''在测试时,保存调试信息'''
if
debug_mode
:
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
# drow_bbox
draw_layout_bbox
(
pdf_info_dict
,
pdf_bytes
,
md_bookname_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_bytes
,
md_bookname_save_path
)
return
pdf_info_dict
return
pdf_info_dict
magic_pdf/pdf_parse_by_txt.py
View file @
a828ebd7
This diff is collapsed.
Click to expand it.
magic_pdf/pdf_parse_for_train.py
View file @
a828ebd7
...
@@ -112,7 +112,6 @@ def parse_pdf_for_train(
...
@@ -112,7 +112,6 @@ def parse_pdf_for_train(
pdf_model_output
,
pdf_model_output
,
save_path
,
save_path
,
book_name
,
book_name
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
image_s3_config
=
None
,
start_page_id
=
0
,
start_page_id
=
0
,
end_page_id
=
None
,
end_page_id
=
None
,
...
@@ -200,7 +199,7 @@ def parse_pdf_for_train(
...
@@ -200,7 +199,7 @@ def parse_pdf_for_train(
flags
=
fitz
.
TEXTFLAGS_TEXT
,
flags
=
fitz
.
TEXTFLAGS_TEXT
,
)[
"blocks"
]
)[
"blocks"
]
model_output_json
=
get_docx_model_output
(
model_output_json
=
get_docx_model_output
(
pdf_model_output
,
p
df_model_profile
,
p
age_id
pdf_model_output
,
page_id
)
)
# 解析图片
# 解析图片
...
...
magic_pdf/pre_proc/detect_footnote.py
View file @
a828ebd7
...
@@ -3,7 +3,7 @@ from magic_pdf.libs.commons import fitz # pyMuPDF库
...
@@ -3,7 +3,7 @@ from magic_pdf.libs.commons import fitz # pyMuPDF库
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
def
parse_footnotes_by_model
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
md_bookname_save_path
,
debug_mode
=
False
):
def
parse_footnotes_by_model
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
md_bookname_save_path
=
None
,
debug_mode
=
False
):
"""
"""
:param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
:param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
:param page :fitz读取的当前页的内容
:param page :fitz读取的当前页的内容
...
...
magic_pdf/pre_proc/ocr_cut_image.py
View file @
a828ebd7
...
@@ -3,18 +3,16 @@ from magic_pdf.libs.ocr_content_type import ContentType
...
@@ -3,18 +3,16 @@ from magic_pdf.libs.ocr_content_type import ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
from
magic_pdf.libs.pdf_image_tools
import
cut_image
def
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
):
def
cut_image_and_table
(
spans
,
page
,
page_id
,
pdf_bytes_md5
,
imageWriter
):
def
s3_return_path
(
type
):
return
join_path
(
book_name
,
type
)
def
img_save
_path
(
type
):
def
return
_path
(
type
):
return
join_path
(
save_path
,
s3_return_path
(
type
)
)
return
join_path
(
pdf_bytes_md5
,
type
)
for
span
in
spans
:
for
span
in
spans
:
span_type
=
span
[
'type'
]
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Image
:
if
span_type
==
ContentType
.
Image
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'images'
),
s3_return_path
=
s3_return_path
(
'images'
),
img_s3_client
=
img_s3_client
)
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'images'
),
imageWriter
=
imageWriter
)
elif
span_type
==
ContentType
.
Table
:
elif
span_type
==
ContentType
.
Table
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'tables'
),
s3_return_path
=
s3_return_path
(
'tables'
),
img_s3_client
=
img_s3_client
)
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'tables'
),
imageWriter
=
imageWriter
)
return
spans
return
spans
magic_pdf/spark/spark_api.py
View file @
a828ebd7
...
@@ -12,27 +12,78 @@
...
@@ -12,27 +12,78 @@
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
"""
from
loguru
import
logger
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
"""
解析文本类pdf
解析文本类pdf
"""
"""
pass
pdf_info_dict
=
parse_pdf_by_txt
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
pdf_info_dict
[
"parse_type"
]
=
"txt"
return
pdf_info_dict
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
"""
解析ocr类pdf
解析ocr类pdf
"""
"""
pass
pdf_info_dict
=
parse_pdf_by_ocr
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
pdf_info_dict
[
"parse_type"
]
=
"ocr"
return
pdf_info_dict
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
"""
ocr和文本混合的pdf,全部解析出来
ocr和文本混合的pdf,全部解析出来
"""
"""
pass
def
parse_pdf
(
method
):
\ No newline at end of file
try
:
return
method
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
except
Exception
as
e
:
logger
.
error
(
f
"{method.__name__} error: {e}"
)
return
None
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_txt
)
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"need_drop"
,
False
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
if
pdf_info_dict
is
None
:
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
else
:
pdf_info_dict
[
"parse_type"
]
=
"ocr"
else
:
pdf_info_dict
[
"parse_type"
]
=
"txt"
return
pdf_info_dict
def
spark_json_extractor
(
jso
:
dict
):
pass
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment