Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
709a6500
Commit
709a6500
authored
Apr 15, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
中间态dict结构调整
部分函数重构
parent
969f08dd
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
210 additions
and
135 deletions
+210
-135
text_demo.py
demo/text_demo.py
+1
-1
mkcontent.py
magic_pdf/dict2md/mkcontent.py
+2
-2
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+5
-5
convert_utils.py
magic_pdf/libs/convert_utils.py
+5
-0
pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+1
-1
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+8
-1
pdf_parse_by_txt.py
magic_pdf/pdf_parse_by_txt.py
+9
-1
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+57
-12
__init__.py
magic_pdf/pipe/__init__.py
+0
-0
base.bak
magic_pdf/spark/base.bak
+6
-0
base.py
magic_pdf/spark/base.py
+0
-39
spark_api.py
magic_pdf/spark/spark_api.py
+27
-73
user_api.py
magic_pdf/user_api.py
+89
-0
config_init_to_json.py
tools/config_init_to_json.py
+0
-0
No files found.
demo/text_demo.py
View file @
709a6500
...
...
@@ -15,7 +15,7 @@ from loguru import logger
from
magic_pdf.libs.config_reader
import
get_s3_config_dict
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
from
magic_pdf.spark.
base
import
get_data_source
from
magic_pdf.spark.
spark_api
import
get_data_source
def
demo_parse_pdf
(
book_name
=
None
,
start_page_id
=
0
,
debug_mode
=
True
):
...
...
magic_pdf/dict2md/mkcontent.py
View file @
709a6500
...
...
@@ -228,12 +228,12 @@ def __insert_before_para(text, type, element, content_list):
logger
.
error
(
f
"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
)
def
mk_universal_format
(
p
ara_dict
:
dic
t
,
img_buket_path
):
def
mk_universal_format
(
p
df_info_list
:
lis
t
,
img_buket_path
):
"""
构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
"""
content_lst
=
[]
for
_
,
page_info
in
para_dict
.
items
()
:
for
page_info
in
pdf_info_list
:
page_lst
=
[]
# 一个page内的段落列表
para_blocks
=
page_info
.
get
(
"para_blocks"
)
pymu_raw_blocks
=
page_info
.
get
(
"preproc_blocks"
)
...
...
magic_pdf/dict2md/ocr_mkcontent.py
View file @
709a6500
...
...
@@ -69,11 +69,11 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
return
'
\n
'
.
join
(
markdown
)
def
ocr_mk_mm_markdown_with_para
(
pdf_info_
dict
:
dict
):
def
ocr_mk_mm_markdown_with_para
(
pdf_info_
list
:
list
,
img_buket_path
):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
()
:
for
page_info
in
pdf_info_list
:
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
page_markdown
=
ocr_mk_markdown_with_para_core
(
paras_of_layout
,
"mm"
)
page_markdown
=
ocr_mk_markdown_with_para_core
(
paras_of_layout
,
"mm"
,
img_buket_path
)
markdown
.
extend
(
page_markdown
)
return
'
\n\n
'
.
join
(
markdown
)
...
...
@@ -100,7 +100,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
return
markdown_with_para_and_pagination
def
ocr_mk_markdown_with_para_core
(
paras_of_layout
,
mode
):
def
ocr_mk_markdown_with_para_core
(
paras_of_layout
,
mode
,
img_buket_path
):
page_markdown
=
[]
for
paras
in
paras_of_layout
:
for
para
in
paras
:
...
...
@@ -123,7 +123,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
if
mode
==
'mm'
:
content
=
f
"
\n

\n
"
content
=
f
"
\n

})
\n
"
elif
mode
==
'nlp'
:
pass
if
content
!=
''
:
...
...
magic_pdf/libs/convert_utils.py
0 → 100644
View file @
709a6500
def dict_to_list(input_dict):
    """Return the values of ``input_dict`` as a new list.

    The values appear in the mapping's own iteration order; the keys are
    discarded.

    Args:
        input_dict: Mapping whose values should be collected.

    Returns:
        A new list containing every value of ``input_dict`` (empty when the
        mapping is empty).
    """
    # Iterating the values view directly avoids unpacking and discarding
    # each key, which the manual items()/append loop did.
    return list(input_dict.values())
magic_pdf/libs/pdf_image_tools.py
View file @
709a6500
...
...
@@ -28,7 +28,7 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
byte_data
=
pix
.
tobytes
(
output
=
'jpeg'
,
jpg_quality
=
95
)
imageWriter
.
write
(
data
=
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
imageWriter
.
write
(
content
=
byte_data
,
path
=
img_hash256_path
,
mode
=
"binary"
)
return
img_hash256_path
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
709a6500
...
...
@@ -5,6 +5,7 @@ from magic_pdf.libs.commons import (
get_delta_time
,
get_docx_model_output
,
)
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.hash_utils
import
compute_md5
...
...
@@ -210,4 +211,10 @@ def parse_pdf_by_ocr(
"""分段"""
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
return
pdf_info_dict
"""dict转list"""
pdf_info_list
=
dict_to_list
(
pdf_info_dict
)
new_pdf_info_dict
=
{
"pdf_info"
:
pdf_info_list
,
}
return
new_pdf_info_dict
magic_pdf/pdf_parse_by_txt.py
View file @
709a6500
...
...
@@ -11,6 +11,7 @@ from magic_pdf.layout.bbox_sort import (
prepare_bboxes_for_layout_split
,
)
from
magic_pdf.layout.layout_sort
import
LAYOUT_UNPROC
,
get_bboxes_layout
,
get_columns_cnt_of_layout
,
sort_text_block
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.markdown_utils
import
escape_special_markdown_char
...
...
@@ -400,4 +401,11 @@ def parse_pdf_by_txt(
if
error_info
is
not
None
:
return
_deal_with_text_exception
(
error_info
)
return
pdf_info_dict
"""dict转list"""
pdf_info_list
=
dict_to_list
(
pdf_info_dict
)
new_pdf_info_dict
=
{
"pdf_info"
:
pdf_info_list
,
}
return
new_pdf_info_dict
magic_pdf/
spark
/UNIPipe.py
→
magic_pdf/
pipe
/UNIPipe.py
View file @
709a6500
import
json
from
loguru
import
logger
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
from
magic_pdf.dict2md.ocr_mkcontent
import
make_standard_format_with_para
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
,
mk_mm_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
make_standard_format_with_para
,
ocr_mk_mm_markdown_with_para
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.io.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.detect_language_from_model
import
get_language_from_model
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.
spark.spark
_api
import
parse_union_pdf
,
parse_ocr_pdf
from
magic_pdf.
user
_api
import
parse_union_pdf
,
parse_ocr_pdf
class
UNIPipe
:
def
__init__
(
self
):
pass
def
classify
(
self
,
pdf_bytes
:
bytes
)
->
str
:
@
staticmethod
def
classify
(
pdf_bytes
:
bytes
)
->
str
:
"""
根据pdf的元数据,判断是否是文本pdf,还是ocr pdf
"""
...
...
@@ -57,25 +63,64 @@ class UNIPipe:
pdf_mid_data
=
parse_ocr_pdf
(
pdf_bytes
,
jso_useful_key
[
'model_list'
],
image_writer
)
else
:
raise
Exception
(
f
"pdf type is not txt or ocr"
)
return
JsonCompressor
.
compress
(
pdf_mid_data
)
return
JsonCompressor
.
compress
_json
(
pdf_mid_data
)
def
mk_uni_format
(
self
,
pdf_mid_data
:
str
,
img_buket_path
:
str
)
->
list
:
@
staticmethod
def
mk_uni_format
(
pdf_mid_data
:
str
,
img_buket_path
:
str
)
->
list
:
"""
根据pdf类型,生成统一格式content_list
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
pdf_mid_data
)
parse_type
=
pdf_mid_data
[
"_parse_type"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
if
parse_type
==
"txt"
:
content_list
=
mk_universal_format
(
pdf_
mid_data
,
img_buket_path
)
content_list
=
mk_universal_format
(
pdf_
info_list
,
img_buket_path
)
elif
parse_type
==
"ocr"
:
content_list
=
make_standard_format_with_para
(
pdf_
mid_data
,
img_buket_path
)
content_list
=
make_standard_format_with_para
(
pdf_
info_list
,
img_buket_path
)
return
content_list
@
staticmethod
def
mk_markdown
(
pdf_mid_data
:
str
,
img_buket_path
:
str
)
->
list
:
"""
根据pdf类型,markdown
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
pdf_mid_data
)
parse_type
=
pdf_mid_data
[
"_parse_type"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
if
parse_type
==
"txt"
:
content_list
=
mk_universal_format
(
pdf_info_list
,
img_buket_path
)
md_content
=
mk_mm_markdown
(
content_list
)
elif
parse_type
==
"ocr"
:
md_content
=
ocr_mk_mm_markdown_with_para
(
pdf_info_list
,
img_buket_path
)
return
md_content
if
__name__
==
'__main__'
:
# 测试
pipe
=
UNIPipe
()
pdf_bytes
=
open
(
r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\数学新星网\edu_00001544.pdf"
,
"rb"
)
.
read
()
pdf_type
=
pipe
.
classify
(
pdf_bytes
)
# file_path = r"tmp/unittest/download-pdfs/数学新星网/edu_00001236.pdf"
drw
=
DiskReaderWriter
(
r"D:/project/20231108code-clean"
)
# pdf_bytes = drw.read(path=file_path, mode=AbsReaderWriter.MODE_BIN)
# pdf_type = UNIPipe.classify(pdf_bytes)
# logger.info(f"pdf_type is {pdf_type}")
pdf_file_path
=
r"linshixuqiu\25536-00.pdf"
model_file_path
=
r"linshixuqiu\25536-00.json"
pdf_bytes
=
drw
.
read
(
path
=
pdf_file_path
,
mode
=
AbsReaderWriter
.
MODE_BIN
)
model_json_txt
=
drw
.
read
(
path
=
model_file_path
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
pdf_type
=
UNIPipe
.
classify
(
pdf_bytes
)
logger
.
info
(
f
"pdf_type is {pdf_type}"
)
jso_useful_key
=
{
"_pdf_type"
:
pdf_type
,
"model_list"
:
json
.
loads
(
model_json_txt
),
}
pipe
=
UNIPipe
()
write_path
=
r"D:\project\20231108code-clean\linshixuqiu\25536-00"
img_buket_path
=
"imgs"
img_writer
=
DiskReaderWriter
(
join_path
(
write_path
,
img_buket_path
))
pdf_mid_data
=
pipe
.
parse
(
pdf_bytes
,
img_writer
,
jso_useful_key
)
md_content
=
pipe
.
mk_markdown
(
pdf_mid_data
,
"imgs"
)
md_writer
=
DiskReaderWriter
(
write_path
)
md_writer
.
write
(
content
=
md_content
,
path
=
"25536-00.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
md_writer
.
write
(
content
=
json
.
dumps
(
JsonCompressor
.
decompress_json
(
pdf_mid_data
),
ensure_ascii
=
False
,
indent
=
4
),
path
=
"25536-00.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
magic_pdf/pipe/__init__.py
0 → 100644
View file @
709a6500
magic_pdf/spark/base.bak
0 → 100644
View file @
709a6500
from loguru import logger
from magic_pdf.libs.drop_reason import DropReason
magic_pdf/spark/base.py
deleted
100644 → 0
View file @
969f08dd
from
loguru
import
logger
from
magic_pdf.libs.drop_reason
import
DropReason
def get_data_source(jso: dict):
    """Return the record's data source.

    Reads ``jso["data_source"]``; when that key is absent or its value is
    ``None``, the value of ``jso["file_source"]`` is returned instead (which
    may itself be ``None``).
    """
    primary = jso.get("data_source")
    return jso.get("file_source") if primary is None else primary
def get_data_type(jso: dict):
    """Return the record's data type.

    Reads ``jso["data_type"]``; when that key is absent or its value is
    ``None``, the value of ``jso["file_type"]`` is returned instead (which
    may itself be ``None``).
    """
    primary = jso.get("data_type")
    return jso.get("file_type") if primary is None else primary
def get_bookid(jso: dict):
    """Return the record's book id.

    Reads ``jso["bookid"]``; when that key is absent or its value is
    ``None``, the value of ``jso["original_file_id"]`` is returned instead
    (which may itself be ``None``).
    """
    primary = jso.get("bookid")
    return jso.get("original_file_id") if primary is None else primary
def exception_handler(jso: dict, e):
    """Log ``e`` and mark ``jso`` as dropped because of it.

    Args:
        jso: Record dict; mutated in place.
        e: The exception (or any object) to log and embed in the record.

    Returns:
        The same ``jso`` object, with ``_need_drop``, ``_drop_reason`` and
        ``_exception`` set.
    """
    logger.exception(e)
    jso.update(
        {
            "_need_drop": True,
            "_drop_reason": DropReason.Exception,
            "_exception": f"ERROR: {e}",
        }
    )
    return jso
def get_bookname(jso: dict):
    """Build the book name ``"<data source>/<file id>"`` for a record.

    The data source comes from ``get_data_source(jso)`` and the file id from
    ``jso["file_id"]``; a missing part is rendered as the text ``None``
    because both are interpolated as-is.
    """
    file_id = jso.get("file_id")
    data_source = get_data_source(jso)
    return f"{data_source}/{file_id}"
magic_pdf/spark/spark_api.py
View file @
709a6500
"""
用户输入:
model数组,每个元素代表一个页面
pdf在s3的路径
截图保存的s3位置
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from
loguru
import
logger
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
from
magic_pdf.libs.drop_reason
import
DropReason
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
解析文本类pdf
"""
pdf_info_dict
=
parse_pdf_by_txt
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
def
get_data_source
(
jso
:
dict
):
data_source
=
jso
.
get
(
"data_source"
)
if
data_source
is
None
:
data_source
=
jso
.
get
(
"file_source"
)
return
data_source
pdf_info_dict
[
"parse_type"
]
=
"txt"
return
pdf_info_dict
def
get_data_type
(
jso
:
dict
):
data_type
=
jso
.
get
(
"data_type"
)
if
data_type
is
None
:
data_type
=
jso
.
get
(
"file_type"
)
return
data_type
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
解析ocr类pdf
"""
pdf_info_dict
=
parse_pdf_by_ocr
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
pdf_info_dict
[
"_parse_type"
]
=
"ocr"
return
pdf_info_dict
def
get_bookid
(
jso
:
dict
):
book_id
=
jso
.
get
(
"bookid"
)
if
book_id
is
None
:
book_id
=
jso
.
get
(
"original_file_id"
)
return
book_id
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
ocr和文本混合的pdf,全部解析出来
"""
def
parse_pdf
(
method
):
try
:
return
method
(
pdf_bytes
,
pdf_models
,
imageWriter
,
start_page_id
=
start_page
,
debug_mode
=
is_debug
,
)
except
Exception
as
e
:
logger
.
error
(
f
"{method.__name__} error: {e}"
)
return
None
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_txt
)
def
exception_handler
(
jso
:
dict
,
e
):
logger
.
exception
(
e
)
jso
[
"_need_drop"
]
=
True
jso
[
"_drop_reason"
]
=
DropReason
.
Exception
jso
[
"_exception"
]
=
f
"ERROR: {e}"
return
jso
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
if
pdf_info_dict
is
None
:
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
else
:
pdf_info_dict
[
"_parse_type"
]
=
"ocr"
else
:
pdf_info_dict
[
"_parse_type"
]
=
"txt"
return
pdf_info_dict
def
get_bookname
(
jso
:
dict
):
data_source
=
get_data_source
(
jso
)
file_id
=
jso
.
get
(
"file_id"
)
book_name
=
f
"{data_source}/{file_id}"
return
book_name
def
spark_json_extractor
(
jso
:
dict
)
->
dict
:
...
...
magic_pdf/user_api.py
0 → 100644
View file @
709a6500
"""
用户输入:
model数组,每个元素代表一个页面
pdf在s3的路径
截图保存的s3位置
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from
loguru
import
logger
from
magic_pdf.io
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """Parse a text-type PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF; forwarded to ``parse_pdf_by_txt``.
        pdf_models: Model output list; forwarded to ``parse_pdf_by_txt``.
        imageWriter: Writer forwarded to ``parse_pdf_by_txt``.
        is_debug: Forwarded as ``debug_mode``.
        start_page: Forwarded as ``start_page_id``.
        *args: Accepted for call compatibility; unused.
        **kwargs: Accepted for call compatibility; unused.

    Returns:
        The object returned by ``parse_pdf_by_txt`` with ``"_parse_type"``
        set to ``"txt"``.
    """
    pdf_info_dict = parse_pdf_by_txt(
        pdf_bytes,
        pdf_models,
        imageWriter,
        start_page_id=start_page,
        debug_mode=is_debug,
    )

    # The key must be "_parse_type" (leading underscore): parse_ocr_pdf and
    # parse_union_pdf write that key and the UNIPipe consumers index
    # pdf_mid_data["_parse_type"]. The previous "parse_type" key left
    # text-parsed results untagged for those readers.
    pdf_info_dict["_parse_type"] = "txt"

    return pdf_info_dict
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """Parse an OCR-type PDF.

    Forwards ``pdf_bytes``, ``pdf_models`` and ``imageWriter`` to
    ``parse_pdf_by_ocr`` (with ``start_page`` as ``start_page_id`` and
    ``is_debug`` as ``debug_mode``), tags the result with
    ``"_parse_type" = "ocr"`` and returns it. ``*args`` and ``**kwargs`` are
    accepted but unused.
    """
    ocr_result = parse_pdf_by_ocr(
        pdf_bytes, pdf_models, imageWriter, start_page_id=start_page, debug_mode=is_debug
    )
    ocr_result["_parse_type"] = "ocr"
    return ocr_result
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
                    *args, **kwargs):
    """Parse a PDF that may be text or OCR, trying text first.

    ``parse_pdf_by_txt`` is attempted first. If it raises, or its result has
    a truthy ``"_need_drop"`` entry, ``parse_pdf_by_ocr`` is attempted
    instead. The result is tagged with ``"_parse_type"`` (``"txt"`` or
    ``"ocr"``) according to the parser that produced it.

    Args:
        pdf_bytes: Raw bytes of the PDF; forwarded to the parser.
        pdf_models: Model output list; forwarded to the parser.
        imageWriter: Writer forwarded to the parser.
        is_debug: Forwarded as ``debug_mode``.
        start_page: Forwarded as ``start_page_id``.
        *args: Accepted but unused.
        **kwargs: Accepted but unused.

    Returns:
        The parser's result with ``"_parse_type"`` set.

    Raises:
        Exception: When the text attempt errors or is dropped and the OCR
            attempt raises as well.
    """

    def _attempt(method):
        # Any parser failure is logged and reported as None so the caller
        # can fall through to the next parser.
        try:
            outcome = method(
                pdf_bytes,
                pdf_models,
                imageWriter,
                start_page_id=start_page,
                debug_mode=is_debug,
            )
        except Exception as e:
            logger.error(f"{method.__name__} error: {e}")
            return None
        return outcome

    txt_result = _attempt(parse_pdf_by_txt)
    if txt_result is not None and not txt_result.get("_need_drop", False):
        txt_result["_parse_type"] = "txt"
        return txt_result

    logger.warning("parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
    ocr_result = _attempt(parse_pdf_by_ocr)
    if ocr_result is None:
        raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
    ocr_result["_parse_type"] = "ocr"
    return ocr_result
uti
ls/config_init_to_json.py
→
too
ls/config_init_to_json.py
View file @
709a6500
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment