Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
d1c9c7dd
Unverified
Commit
d1c9c7dd
authored
Oct 10, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 10, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'opendatalab:dev' into dev
parents
7f9d80fc
ea7bc620
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
145 additions
and
20 deletions
+145
-20
doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+33
-17
para_split_v3.py
magic_pdf/para/para_split_v3.py
+110
-0
pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+2
-3
No files found.
magic_pdf/model/doc_analyze_by_custom_model.py
View file @
d1c9c7dd
...
@@ -4,6 +4,7 @@ import fitz
...
@@ -4,6 +4,7 @@ import fitz
import
numpy
as
np
import
numpy
as
np
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.clean_memory
import
clean_memory
from
magic_pdf.libs.config_reader
import
get_local_models_dir
,
get_device
,
get_table_recog_config
from
magic_pdf.libs.config_reader
import
get_local_models_dir
,
get_device
,
get_table_recog_config
from
magic_pdf.model.model_list
import
MODEL
from
magic_pdf.model.model_list
import
MODEL
import
magic_pdf.model
as
model_config
import
magic_pdf.model
as
model_config
...
@@ -23,7 +24,7 @@ def remove_duplicates_dicts(lst):
...
@@ -23,7 +24,7 @@ def remove_duplicates_dicts(lst):
return
unique_dicts
return
unique_dicts
def
load_images_from_pdf
(
pdf_bytes
:
bytes
,
dpi
=
200
)
->
list
:
def
load_images_from_pdf
(
pdf_bytes
:
bytes
,
dpi
=
200
,
start_page_id
=
0
,
end_page_id
=
None
)
->
list
:
try
:
try
:
from
PIL
import
Image
from
PIL
import
Image
except
ImportError
:
except
ImportError
:
...
@@ -32,18 +33,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
...
@@ -32,18 +33,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
images
=
[]
images
=
[]
with
fitz
.
open
(
"pdf"
,
pdf_bytes
)
as
doc
:
with
fitz
.
open
(
"pdf"
,
pdf_bytes
)
as
doc
:
pdf_page_num
=
doc
.
page_count
end_page_id
=
end_page_id
if
end_page_id
is
not
None
and
end_page_id
>=
0
else
pdf_page_num
-
1
if
end_page_id
>
pdf_page_num
-
1
:
logger
.
warning
(
"end_page_id is out of range, use images length"
)
end_page_id
=
pdf_page_num
-
1
for
index
in
range
(
0
,
doc
.
page_count
):
for
index
in
range
(
0
,
doc
.
page_count
):
page
=
doc
[
index
]
if
start_page_id
<=
index
<=
end_page_id
:
mat
=
fitz
.
Matrix
(
dpi
/
72
,
dpi
/
72
)
page
=
doc
[
index
]
pm
=
page
.
get_pixmap
(
matrix
=
mat
,
alpha
=
False
)
mat
=
fitz
.
Matrix
(
dpi
/
72
,
dpi
/
72
)
pm
=
page
.
get_pixmap
(
matrix
=
mat
,
alpha
=
False
)
# If the width or height exceeds 9000 after scaling, do not scale further.
if
pm
.
width
>
9000
or
pm
.
height
>
9000
:
pm
=
page
.
get_pixmap
(
matrix
=
fitz
.
Matrix
(
1
,
1
),
alpha
=
False
)
# If the width or height exceeds 9000 after scaling, do not scale further.
img
=
Image
.
frombytes
(
"RGB"
,
(
pm
.
width
,
pm
.
height
),
pm
.
samples
)
if
pm
.
width
>
9000
or
pm
.
height
>
9000
:
img
=
np
.
array
(
img
)
pm
=
page
.
get_pixmap
(
matrix
=
fitz
.
Matrix
(
1
,
1
),
alpha
=
False
)
img_dict
=
{
"img"
:
img
,
"width"
:
pm
.
width
,
"height"
:
pm
.
height
}
else
:
img_dict
=
{
"img"
:
[],
"width"
:
0
,
"height"
:
0
}
img
=
Image
.
frombytes
(
"RGB"
,
(
pm
.
width
,
pm
.
height
),
pm
.
samples
)
img
=
np
.
array
(
img
)
img_dict
=
{
"img"
:
img
,
"width"
:
pm
.
width
,
"height"
:
pm
.
height
}
images
.
append
(
img_dict
)
images
.
append
(
img_dict
)
return
images
return
images
...
@@ -111,14 +122,14 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
...
@@ -111,14 +122,14 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
model_manager
=
ModelSingleton
()
model_manager
=
ModelSingleton
()
custom_model
=
model_manager
.
get_model
(
ocr
,
show_log
,
lang
)
custom_model
=
model_manager
.
get_model
(
ocr
,
show_log
,
lang
)
images
=
load_images_from_pdf
(
pdf_bytes
)
with
fitz
.
open
(
"pdf"
,
pdf_bytes
)
as
doc
:
pdf_page_num
=
doc
.
page_count
# end_page_id = end_page_id if end_page_id else len(images) - 1
end_page_id
=
end_page_id
if
end_page_id
is
not
None
and
end_page_id
>=
0
else
pdf_page_num
-
1
end_page_id
=
end_page_id
if
end_page_id
is
not
None
and
end_page_id
>=
0
else
len
(
images
)
-
1
if
end_page_id
>
pdf_page_num
-
1
:
logger
.
warning
(
"end_page_id is out of range, use images length"
)
end_page_id
=
pdf_page_num
-
1
if
end_page_id
>
len
(
images
)
-
1
:
images
=
load_images_from_pdf
(
pdf_bytes
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
logger
.
warning
(
"end_page_id is out of range, use images length"
)
end_page_id
=
len
(
images
)
-
1
model_json
=
[]
model_json
=
[]
doc_analyze_start
=
time
.
time
()
doc_analyze_start
=
time
.
time
()
...
@@ -135,6 +146,11 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
...
@@ -135,6 +146,11 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
page_dict
=
{
"layout_dets"
:
result
,
"page_info"
:
page_info
}
page_dict
=
{
"layout_dets"
:
result
,
"page_info"
:
page_info
}
model_json
.
append
(
page_dict
)
model_json
.
append
(
page_dict
)
gc_start
=
time
.
time
()
clean_memory
()
gc_time
=
round
(
time
.
time
()
-
gc_start
,
2
)
logger
.
info
(
f
"gc time: {gc_time}"
)
doc_analyze_time
=
round
(
time
.
time
()
-
doc_analyze_start
,
2
)
doc_analyze_time
=
round
(
time
.
time
()
-
doc_analyze_start
,
2
)
doc_analyze_speed
=
round
(
(
end_page_id
+
1
-
start_page_id
)
/
doc_analyze_time
,
2
)
doc_analyze_speed
=
round
(
(
end_page_id
+
1
-
start_page_id
)
/
doc_analyze_time
,
2
)
logger
.
info
(
f
"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
logger
.
info
(
f
"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
...
...
magic_pdf/para/para_split_v3.py
0 → 100644
View file @
d1c9c7dd
import
copy
from
magic_pdf.libs.Constants
import
LINES_DELETED
,
CROSS_PAGE
# Sentence-terminating punctuation (ASCII plus full-width CJK forms).
# A line whose last span ends with one of these is treated as a finished
# paragraph and is never merged with the following block.
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?')
def __process_blocks(blocks):
    """Split *blocks* into groups of consecutive text blocks.

    A group is closed whenever the *next* block is a 'title' or an
    'interline_equation', so paragraph merging never crosses a heading or a
    displayed formula.  Non-text blocks never join a group, but they can
    still close one.  Note that a run of consecutive separators produces
    empty groups, matching the original behavior (callers only act on
    groups with more than one member).

    Side effect: every text block gains a 'bbox_fs' key — a copy of its
    'bbox', tightened to the union of its lines' bboxes when it has lines.

    :param blocks: list of block dicts, each with a 'type' key; text blocks
        also carry 'bbox' and 'lines' (each line dict has a 4-item 'bbox').
    :return: list of groups (lists) of text-block dicts.
    """
    result = []
    current_group = []

    for i, current_block in enumerate(blocks):
        # Only text blocks participate in paragraph grouping.
        if current_block['type'] == 'text':
            current_block['bbox_fs'] = copy.deepcopy(current_block['bbox'])
            lines = current_block['lines']
            if len(lines) > 0:
                # Tighten the block bbox to the union of its line bboxes.
                current_block['bbox_fs'] = [
                    min(line['bbox'][0] for line in lines),
                    min(line['bbox'][1] for line in lines),
                    max(line['bbox'][2] for line in lines),
                    max(line['bbox'][3] for line in lines),
                ]
            current_group.append(current_block)

        # A following title/interline_equation closes the current group.
        if i + 1 < len(blocks):
            next_block = blocks[i + 1]
            if next_block['type'] in ['title', 'interline_equation']:
                result.append(current_group)
                current_group = []

    # Flush the trailing group, if any.
    if current_group:
        result.append(current_group)

    return result
def __merge_2_blocks(block1, block2):
    """Merge the continuation block *block1* into the preceding *block2*.

    *block1* is the later block, *block2* the one before it.  The merge
    fires only when block1's first line starts flush with the block's left
    edge (within half a line height, i.e. no paragraph indent) and block2's
    last line runs out to the block's right edge (within one line height)
    without ending in sentence-stop punctuation — i.e. block2 looks
    unfinished and block1 looks like its continuation.

    On merge, block1's lines are appended to block2, block1's line list is
    emptied and block1 is flagged with LINES_DELETED; spans moved across a
    page boundary are additionally flagged with CROSS_PAGE.

    Fix over the original: also require block2 to have at least one line,
    so a preceding text block with an empty 'lines' list no longer raises
    IndexError on ``block2['lines'][-1]``.

    :return: the (possibly mutated) ``(block1, block2)`` pair.
    """
    if len(block1['lines']) > 0 and len(block2['lines']) > 0:
        first_line = block1['lines'][0]
        line_height = first_line['bbox'][3] - first_line['bbox'][1]
        # block1's first line is not indented -> candidate continuation.
        if abs(block1['bbox_fs'][0] - first_line['bbox'][0]) < line_height / 2:
            last_line = block2['lines'][-1]
            if len(last_line['spans']) > 0:
                last_span = last_line['spans'][-1]
                line_height = last_line['bbox'][3] - last_line['bbox'][1]
                # block2's last line reaches the right edge and carries no
                # sentence-ending punctuation -> unfinished paragraph.
                if (abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
                        and not last_span['content'].endswith(LINE_STOP_FLAG)):
                    if block1['page_num'] != block2['page_num']:
                        # Mark spans that move across a page boundary.
                        for line in block1['lines']:
                            for span in line['spans']:
                                span[CROSS_PAGE] = True
                    block2['lines'].extend(block1['lines'])
                    block1['lines'] = []
                    block1[LINES_DELETED] = True

    return block1, block2
def __para_merge_page(blocks):
    """Group the text blocks and merge paragraph continuations in place.

    Each group produced by __process_blocks is walked back-to-front,
    handing every block and its predecessor to __merge_2_blocks, which
    decides whether they form a single paragraph.
    """
    for group in __process_blocks(blocks):
        # Singleton (or empty) groups have nothing to merge.
        if len(group) <= 1:
            continue
        # Walk backwards so a block is folded into its predecessor before
        # that predecessor is itself considered.
        for idx in range(len(group) - 1, 0, -1):
            __merge_2_blocks(group[idx], group[idx - 1])
def para_split(pdf_info_dict, debug_mode=False):
    """Merge preprocessed blocks into paragraph blocks, document-wide.

    Deep-copies every page's 'preproc_blocks', tags each block with its
    page key, runs the paragraph merge over the whole document at once (so
    paragraphs can be merged across page boundaries), then writes the
    resulting blocks back to each page under 'para_blocks'.

    :param pdf_info_dict: mapping of page key -> page dict containing
        'preproc_blocks'; mutated in place ('para_blocks' is added).
    :param debug_mode: unused; kept for backward-compatible interface.
    """
    all_blocks = []
    for page_num, page in pdf_info_dict.items():
        blocks = copy.deepcopy(page['preproc_blocks'])
        for block in blocks:
            block['page_num'] = page_num
        all_blocks.extend(blocks)

    __para_merge_page(all_blocks)

    # Bucket blocks by page in one pass (O(blocks)) instead of rescanning
    # all_blocks once per page (O(pages * blocks)); per-page order is kept.
    blocks_by_page = {}
    for block in all_blocks:
        blocks_by_page.setdefault(block['page_num'], []).append(block)

    for page_num, page in pdf_info_dict.items():
        page['para_blocks'] = blocks_by_page.get(page_num, [])
if __name__ == '__main__':
    # Demo fixture.  Text blocks must carry 'bbox' and 'lines', because
    # __process_blocks reads both when computing 'bbox_fs' (the original
    # fixture omitted them, so running this demo raised KeyError).
    input_blocks = [
        {'type': 'text', 'content': '这是第一段', 'bbox': [0, 0, 100, 10], 'lines': []},
        {'type': 'text', 'content': '这是第二段', 'bbox': [0, 12, 100, 22], 'lines': []},
        {'type': 'title', 'content': '这是一个标题'},
        {'type': 'text', 'content': '这是第三段', 'bbox': [0, 36, 100, 46], 'lines': []},
        {'type': 'interline_equation', 'content': '这是一个公式'},
        {'type': 'text', 'content': '这是第四段', 'bbox': [0, 60, 100, 70], 'lines': []},
        {'type': 'image', 'content': '这是一张图片'},
        {'type': 'text', 'content': '这是第五段', 'bbox': [0, 84, 100, 94], 'lines': []},
        {'type': 'table', 'content': '这是一张表格'}
    ]
    # Print each resulting group of consecutive text blocks.
    for group_index, group in enumerate(__process_blocks(input_blocks)):
        print(f"Group {group_index}: {group}")
magic_pdf/pdf_parse_union_core_v2.py
View file @
d1c9c7dd
...
@@ -17,6 +17,7 @@ from magic_pdf.libs.hash_utils import compute_md5
...
@@ -17,6 +17,7 @@ from magic_pdf.libs.hash_utils import compute_md5
from
magic_pdf.libs.local_math
import
float_equal
from
magic_pdf.libs.local_math
import
float_equal
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.para.para_split_v3
import
para_split
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component_v2
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component_v2
from
magic_pdf.pre_proc.cut_image
import
ocr_cut_image_and_table
from
magic_pdf.pre_proc.cut_image
import
ocr_cut_image_and_table
...
@@ -435,9 +436,7 @@ def pdf_parse_union(pdf_bytes,
...
@@ -435,9 +436,7 @@ def pdf_parse_union(pdf_bytes,
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
"""分段"""
"""分段"""
# para_split(pdf_info_dict, debug_mode=debug_mode)
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
for
page_num
,
page
in
pdf_info_dict
.
items
():
page
[
'para_blocks'
]
=
page
[
'preproc_blocks'
]
"""dict转list"""
"""dict转list"""
pdf_info_list
=
dict_to_list
(
pdf_info_dict
)
pdf_info_list
=
dict_to_list
(
pdf_info_dict
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment