Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
de60127c
Unverified
Commit
de60127c
authored
Oct 06, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 06, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #690 from myhloli/dev
refactor(model): improve timing information and performance
parents
14bb5865
be1b1ae7
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
19 additions
and
10 deletions
+19
-10
doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+5
-2
pdf_extract_kit.py
magic_pdf/model/pdf_extract_kit.py
+14
-8
No files found.
magic_pdf/model/doc_analyze_by_custom_model.py
View file @
de60127c
...
@@ -134,7 +134,10 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
...
@@ -134,7 +134,10 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
page_info
=
{
"page_no"
:
index
,
"height"
:
page_height
,
"width"
:
page_width
}
page_info
=
{
"page_no"
:
index
,
"height"
:
page_height
,
"width"
:
page_width
}
page_dict
=
{
"layout_dets"
:
result
,
"page_info"
:
page_info
}
page_dict
=
{
"layout_dets"
:
result
,
"page_info"
:
page_info
}
model_json
.
append
(
page_dict
)
model_json
.
append
(
page_dict
)
doc_analyze_cost
=
time
.
time
()
-
doc_analyze_start
logger
.
info
(
f
"doc analyze cost: {doc_analyze_cost}"
)
doc_analyze_time
=
round
(
time
.
time
()
-
doc_analyze_start
,
2
)
doc_analyze_speed
=
round
(
(
end_page_id
+
1
-
start_page_id
)
/
doc_analyze_time
,
2
)
logger
.
info
(
f
"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
f
" speed: {doc_analyze_speed} pages/second"
)
return
model_json
return
model_json
magic_pdf/model/pdf_extract_kit.py
View file @
de60127c
...
@@ -7,6 +7,7 @@ from magic_pdf.libs.clean_memory import clean_memory
...
@@ -7,6 +7,7 @@ from magic_pdf.libs.clean_memory import clean_memory
from
magic_pdf.model.model_list
import
AtomicModel
from
magic_pdf.model.model_list
import
AtomicModel
os
.
environ
[
'NO_ALBUMENTATIONS_UPDATE'
]
=
'1'
# 禁止albumentations检查更新
os
.
environ
[
'NO_ALBUMENTATIONS_UPDATE'
]
=
'1'
# 禁止albumentations检查更新
os
.
environ
[
'YOLO_VERBOSE'
]
=
'False'
# disable yolo logger
try
:
try
:
import
cv2
import
cv2
import
yaml
import
yaml
...
@@ -274,6 +275,8 @@ class CustomPEKModel:
...
@@ -274,6 +275,8 @@ class CustomPEKModel:
def
__call__
(
self
,
image
):
def
__call__
(
self
,
image
):
page_start
=
time
.
time
()
latex_filling_list
=
[]
latex_filling_list
=
[]
mf_image_list
=
[]
mf_image_list
=
[]
...
@@ -281,13 +284,15 @@ class CustomPEKModel:
...
@@ -281,13 +284,15 @@ class CustomPEKModel:
layout_start
=
time
.
time
()
layout_start
=
time
.
time
()
layout_res
=
self
.
layout_model
(
image
,
ignore_catids
=
[])
layout_res
=
self
.
layout_model
(
image
,
ignore_catids
=
[])
layout_cost
=
round
(
time
.
time
()
-
layout_start
,
2
)
layout_cost
=
round
(
time
.
time
()
-
layout_start
,
2
)
logger
.
info
(
f
"layout detection
cost
: {layout_cost}"
)
logger
.
info
(
f
"layout detection
time
: {layout_cost}"
)
pil_img
=
Image
.
fromarray
(
image
)
pil_img
=
Image
.
fromarray
(
image
)
if
self
.
apply_formula
:
if
self
.
apply_formula
:
# 公式检测
# 公式检测
mfd_start
=
time
.
time
()
mfd_res
=
self
.
mfd_model
.
predict
(
image
,
imgsz
=
1888
,
conf
=
0.25
,
iou
=
0.45
,
verbose
=
True
)[
0
]
mfd_res
=
self
.
mfd_model
.
predict
(
image
,
imgsz
=
1888
,
conf
=
0.25
,
iou
=
0.45
,
verbose
=
True
)[
0
]
logger
.
info
(
f
"mfd time: {round(time.time() - mfd_start, 2)}"
)
for
xyxy
,
conf
,
cla
in
zip
(
mfd_res
.
boxes
.
xyxy
.
cpu
(),
mfd_res
.
boxes
.
conf
.
cpu
(),
mfd_res
.
boxes
.
cls
.
cpu
()):
for
xyxy
,
conf
,
cla
in
zip
(
mfd_res
.
boxes
.
xyxy
.
cpu
(),
mfd_res
.
boxes
.
conf
.
cpu
(),
mfd_res
.
boxes
.
cls
.
cpu
()):
xmin
,
ymin
,
xmax
,
ymax
=
[
int
(
p
.
item
())
for
p
in
xyxy
]
xmin
,
ymin
,
xmax
,
ymax
=
[
int
(
p
.
item
())
for
p
in
xyxy
]
new_item
=
{
new_item
=
{
...
@@ -381,7 +386,7 @@ class CustomPEKModel:
...
@@ -381,7 +386,7 @@ class CustomPEKModel:
})
})
ocr_cost
=
round
(
time
.
time
()
-
ocr_start
,
2
)
ocr_cost
=
round
(
time
.
time
()
-
ocr_start
,
2
)
logger
.
info
(
f
"ocr
cost
: {ocr_cost}"
)
logger
.
info
(
f
"ocr
time
: {ocr_cost}"
)
# 表格识别 table recognition
# 表格识别 table recognition
if
self
.
apply_table
:
if
self
.
apply_table
:
...
@@ -389,7 +394,7 @@ class CustomPEKModel:
...
@@ -389,7 +394,7 @@ class CustomPEKModel:
for
res
in
table_res_list
:
for
res
in
table_res_list
:
new_image
,
_
=
crop_img
(
res
,
pil_img
)
new_image
,
_
=
crop_img
(
res
,
pil_img
)
single_table_start_time
=
time
.
time
()
single_table_start_time
=
time
.
time
()
logger
.
info
(
"------------------table recognition processing begins-----------------"
)
#
logger.info("------------------table recognition processing begins-----------------")
latex_code
=
None
latex_code
=
None
html_code
=
None
html_code
=
None
if
self
.
table_model_type
==
STRUCT_EQTABLE
:
if
self
.
table_model_type
==
STRUCT_EQTABLE
:
...
@@ -399,7 +404,7 @@ class CustomPEKModel:
...
@@ -399,7 +404,7 @@ class CustomPEKModel:
html_code
=
self
.
table_model
.
img2html
(
new_image
)
html_code
=
self
.
table_model
.
img2html
(
new_image
)
run_time
=
time
.
time
()
-
single_table_start_time
run_time
=
time
.
time
()
-
single_table_start_time
logger
.
info
(
f
"------------table recognition processing ends within {run_time}s-----"
)
#
logger.info(f"------------table recognition processing ends within {run_time}s-----")
if
run_time
>
self
.
table_max_time
:
if
run_time
>
self
.
table_max_time
:
logger
.
warning
(
f
"------------table recognition processing exceeds max time {self.table_max_time}s----------"
)
logger
.
warning
(
f
"------------table recognition processing exceeds max time {self.table_max_time}s----------"
)
# 判断是否返回正常
# 判断是否返回正常
...
@@ -410,12 +415,13 @@ class CustomPEKModel:
...
@@ -410,12 +415,13 @@ class CustomPEKModel:
if
expected_ending
:
if
expected_ending
:
res
[
"latex"
]
=
latex_code
res
[
"latex"
]
=
latex_code
else
:
else
:
logger
.
warning
(
f
"
------------table recognition processing fails----------
"
)
logger
.
warning
(
f
"
table recognition processing fails, not found expected LaTeX table end
"
)
elif
html_code
:
elif
html_code
:
res
[
"html"
]
=
html_code
res
[
"html"
]
=
html_code
else
:
else
:
logger
.
warning
(
f
"------------table recognition processing fails----------"
)
logger
.
warning
(
f
"table recognition processing fails, not get latex or html return"
)
table_cost
=
round
(
time
.
time
()
-
table_start
,
2
)
logger
.
info
(
f
"table time: {round(time.time() - table_start, 2)}"
)
logger
.
info
(
f
"table cost: {table_cost}"
)
logger
.
info
(
f
"-----page total time: {round(time.time() - page_start, 2)}-----"
)
return
layout_res
return
layout_res
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment