Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
c96aa88d
Unverified
Commit
c96aa88d
authored
Jun 18, 2024
by
myhloli
Committed by
GitHub
Jun 18, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #119 from icecraft/feat/parallel_paddle
feat: parallelize paddle
parents
084dc22a
738f9274
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
55 additions
and
49 deletions
+55
-49
doc_analyze_by_pp_structurev2.py
magic_pdf/model/doc_analyze_by_pp_structurev2.py
+55
-49
No files found.
magic_pdf/model/doc_analyze_by_pp_structurev2.py
View file @
c96aa88d
...
@@ -7,6 +7,7 @@ from PIL import Image
...
@@ -7,6 +7,7 @@ from PIL import Image
from
loguru
import
logger
from
loguru
import
logger
import
numpy
as
np
import
numpy
as
np
def
region_to_bbox
(
region
):
def
region_to_bbox
(
region
):
x0
=
region
[
0
][
0
]
x0
=
region
[
0
][
0
]
y0
=
region
[
0
][
1
]
y0
=
region
[
0
][
1
]
...
@@ -22,12 +23,14 @@ def dict_compare(d1, d2):
...
@@ -22,12 +23,14 @@ def dict_compare(d1, d2):
def
remove_duplicates_dicts
(
lst
):
def
remove_duplicates_dicts
(
lst
):
unique_dicts
=
[]
unique_dicts
=
[]
for
dict_item
in
lst
:
for
dict_item
in
lst
:
if
not
any
(
dict_compare
(
dict_item
,
existing_dict
)
for
existing_dict
in
unique_dicts
):
if
not
any
(
dict_compare
(
dict_item
,
existing_dict
)
for
existing_dict
in
unique_dicts
):
unique_dicts
.
append
(
dict_item
)
unique_dicts
.
append
(
dict_item
)
return
unique_dicts
return
unique_dicts
def
doc_analyze
(
pdf_bytes
:
bytes
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
):
ocr_engine
=
PPStructure
(
table
=
False
,
ocr
=
ocr
,
show_log
=
show_log
)
def
load_imags_from_pdf
(
pdf_bytes
:
bytes
,
dpi
=
200
):
imgs
=
[]
imgs
=
[]
with
fitz
.
open
(
"pdf"
,
pdf_bytes
)
as
doc
:
with
fitz
.
open
(
"pdf"
,
pdf_bytes
)
as
doc
:
for
index
in
range
(
0
,
doc
.
page_count
):
for
index
in
range
(
0
,
doc
.
page_count
):
...
@@ -42,23 +45,20 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
...
@@ -42,23 +45,20 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
img
=
Image
.
frombytes
(
"RGB"
,
[
pm
.
width
,
pm
.
height
],
pm
.
samples
)
img
=
Image
.
frombytes
(
"RGB"
,
[
pm
.
width
,
pm
.
height
],
pm
.
samples
)
img
=
cv2
.
cvtColor
(
np
.
array
(
img
),
cv2
.
COLOR_RGB2BGR
)
img
=
cv2
.
cvtColor
(
np
.
array
(
img
),
cv2
.
COLOR_RGB2BGR
)
img_dict
=
{
img_dict
=
{
"img"
:
img
,
"width"
:
pm
.
width
,
"height"
:
pm
.
height
}
"img"
:
img
,
"width"
:
pm
.
width
,
"height"
:
pm
.
height
}
imgs
.
append
(
img_dict
)
imgs
.
append
(
img_dict
)
model_json
=
[]
for
index
,
img_dict
in
enumerate
(
imgs
):
class
CustomPaddleModel
:
img
=
img_dict
[
'img'
]
def
__init___
(
self
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
):
page_width
=
img_dict
[
'width'
]
self
.
model
=
PPStructure
(
table
=
False
,
ocr
=
ocr
,
show_log
=
show_log
)
page_height
=
img_dict
[
'height'
]
result
=
ocr_engine
(
img
)
def
__call__
(
self
,
img
):
result
=
self
.
model
(
img
)
spans
=
[]
spans
=
[]
for
line
in
result
:
for
line
in
result
:
line
.
pop
(
'img'
)
line
.
pop
(
"img"
)
'''
"""
为paddle输出适配type no.
为paddle输出适配type no.
title: 0 # 标题
title: 0 # 标题
text: 1 # 文本
text: 1 # 文本
...
@@ -71,37 +71,38 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
...
@@ -71,37 +71,38 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
figure_caption: 4 # 图片描述
figure_caption: 4 # 图片描述
table: 5 # 表格
table: 5 # 表格
table_caption: 6 # 表格描述
table_caption: 6 # 表格描述
'''
"""
if
line
[
'type'
]
==
'title'
:
if
line
[
"type"
]
==
"title"
:
line
[
'category_id'
]
=
0
line
[
"category_id"
]
=
0
elif
line
[
'type'
]
in
[
'text'
,
'reference'
]:
elif
line
[
"type"
]
in
[
"text"
,
"reference"
]:
line
[
'category_id'
]
=
1
line
[
"category_id"
]
=
1
elif
line
[
'type'
]
==
'figure'
:
elif
line
[
"type"
]
==
"figure"
:
line
[
'category_id'
]
=
3
line
[
"category_id"
]
=
3
elif
line
[
'type'
]
==
'figure_caption'
:
elif
line
[
"type"
]
==
"figure_caption"
:
line
[
'category_id'
]
=
4
line
[
"category_id"
]
=
4
elif
line
[
'type'
]
==
'table'
:
elif
line
[
"type"
]
==
"table"
:
line
[
'category_id'
]
=
5
line
[
"category_id"
]
=
5
elif
line
[
'type'
]
==
'table_caption'
:
elif
line
[
"type"
]
==
"table_caption"
:
line
[
'category_id'
]
=
6
line
[
"category_id"
]
=
6
elif
line
[
'type'
]
==
'equation'
:
elif
line
[
"type"
]
==
"equation"
:
line
[
'category_id'
]
=
8
line
[
"category_id"
]
=
8
elif
line
[
'type'
]
in
[
'header'
,
'footer'
]:
elif
line
[
"type"
]
in
[
"header"
,
"footer"
]:
line
[
'category_id'
]
=
2
line
[
"category_id"
]
=
2
else
:
else
:
logger
.
warning
(
f
"unknown type: {line['type']}"
)
logger
.
warning
(
f
"unknown type: {line['type']}"
)
# 兼容不输出score的paddleocr版本
# 兼容不输出score的paddleocr版本
if
line
.
get
(
"score"
)
is
None
:
if
line
.
get
(
"score"
)
is
None
:
line
[
'score'
]
=
0.5
+
random
.
random
()
*
0.5
line
[
"score"
]
=
0.5
+
random
.
random
()
*
0.5
res
=
line
.
pop
(
'res'
,
None
)
res
=
line
.
pop
(
"res"
,
None
)
if
res
is
not
None
and
len
(
res
)
>
0
:
if
res
is
not
None
and
len
(
res
)
>
0
:
for
span
in
res
:
for
span
in
res
:
new_span
=
{
'category_id'
:
15
,
new_span
=
{
'bbox'
:
region_to_bbox
(
span
[
'text_region'
]),
"category_id"
:
15
,
'score'
:
span
[
'confidence'
],
"bbox"
:
region_to_bbox
(
span
[
"text_region"
]),
'text'
:
span
[
'text'
]
"score"
:
span
[
"confidence"
],
"text"
:
span
[
"text"
],
}
}
spans
.
append
(
new_span
)
spans
.
append
(
new_span
)
...
@@ -109,16 +110,21 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
...
@@ -109,16 +110,21 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
result
.
extend
(
spans
)
result
.
extend
(
spans
)
result
=
remove_duplicates_dicts
(
result
)
result
=
remove_duplicates_dicts
(
result
)
return
result
page_info
=
{
"page_no"
:
index
,
def
doc_analyze
(
pdf_bytes
:
bytes
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
):
"height"
:
page_height
,
imgs
=
load_imags_from_pdf
(
pdf_bytes
)
"width"
:
page_width
custom_paddle
=
CustomPaddleModel
()
}
page_dict
=
{
model_json
=
[]
"layout_dets"
:
result
,
for
index
,
img_dict
in
enumerate
(
imgs
):
"page_info"
:
page_info
img
=
img_dict
[
"img"
]
}
page_width
=
img_dict
[
"width"
]
page_height
=
img_dict
[
"height"
]
result
=
custom_paddle
(
img
)
page_info
=
{
"page_no"
:
index
,
"height"
:
page_height
,
"width"
:
page_width
}
page_dict
=
{
"layout_dets"
:
result
,
"page_info"
:
page_info
}
model_json
.
append
(
page_dict
)
model_json
.
append
(
page_dict
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment