Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
8b275ea2
Commit
8b275ea2
authored
Sep 09, 2024
by
houlinfeng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: mineru_web
parent
2730b96b
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
52 additions
and
38 deletions
+52
-38
__init__.py
projects/web_api/web_api/api/analysis/__init__.py
+1
-1
analysis_view.py
projects/web_api/web_api/api/analysis/analysis_view.py
+16
-10
ext.py
projects/web_api/web_api/api/analysis/ext.py
+2
-2
models.py
projects/web_api/web_api/api/analysis/models.py
+1
-1
pdf_ext.py
projects/web_api/web_api/api/analysis/pdf_ext.py
+3
-2
task_view.py
projects/web_api/web_api/api/analysis/task_view.py
+25
-18
upload_view.py
projects/web_api/web_api/api/analysis/upload_view.py
+2
-2
ext.py
projects/web_api/web_api/common/ext.py
+2
-2
mineru_web.db
projects/web_api/web_api/config/mineru_web.db
+0
-0
No files found.
projects/web_api/web_api/api/analysis/__init__.py
View file @
8b275ea2
...
...
@@ -15,4 +15,4 @@ api_v2.add_resource(ImgView, '/analysis/pdf_img')
api_v2
.
add_resource
(
MdView
,
'/analysis/pdf_md'
)
api_v2
.
add_resource
(
TaskView
,
'/extract/taskQueue'
)
api_v2
.
add_resource
(
HistoricalTasksView
,
'/extract/list'
)
api_v2
.
add_resource
(
DeleteTaskView
,
'/extract/task'
)
\ No newline at end of file
api_v2
.
add_resource
(
DeleteTaskView
,
'/extract/task/<int:id>'
)
\ No newline at end of file
projects/web_api/web_api/api/analysis/analysis_view.py
View file @
8b275ea2
...
...
@@ -29,12 +29,14 @@ class AnalysisTaskProgressView(Resource):
case
'pdf'
:
analysis_pdf
=
AnalysisPdf
.
query
.
filter
(
AnalysisPdf
.
id
==
analysis_task
.
analysis_pdf_id
)
.
first
()
file_url
=
url_for
(
'analysis.uploadpdfview'
,
filename
=
analysis_task
.
file_name
,
as_attachment
=
False
)
file_name_split
=
analysis_task
.
file_name
.
split
(
"_"
)
file_name
=
file_name_split
[
-
1
]
if
file_name_split
else
analysis_task
.
file_name
if
analysis_task
.
status
==
0
:
data
=
{
"state"
:
task_state_map
.
get
(
analysis_task
.
status
),
"status"
:
analysis_pdf
.
status
,
"url"
:
file_url
,
"fileName"
:
analysis_task
.
file_name
,
"fileName"
:
file_name
,
"content"
:
[],
"markdownUrl"
:
[],
"fullMdLink"
:
""
,
...
...
@@ -47,10 +49,10 @@ class AnalysisTaskProgressView(Resource):
md_link_list
=
json
.
loads
(
analysis_pdf
.
md_link_list
)
full_md_link
=
analysis_pdf
.
full_md_link
data
=
{
"state"
:
task_state_map
.
get
(
analysis_task
.
status
)
,
"state"
:
"failed"
,
"status"
:
analysis_pdf
.
status
,
"url"
:
file_url
,
"fileName"
:
analysis_task
.
file_name
,
"fileName"
:
file_name
,
"content"
:
bbox_info
,
"markdownUrl"
:
md_link_list
,
"fullMdLink"
:
full_md_link
,
...
...
@@ -62,7 +64,7 @@ class AnalysisTaskProgressView(Resource):
"state"
:
task_state_map
.
get
(
analysis_task
.
status
),
"status"
:
analysis_pdf
.
status
,
"url"
:
file_url
,
"fileName"
:
analysis_task
.
file_name
,
"fileName"
:
file_name
,
"content"
:
[],
"markdownUrl"
:
[],
"fullMdLink"
:
""
,
...
...
@@ -75,7 +77,7 @@ class AnalysisTaskProgressView(Resource):
"state"
:
task_state_map
.
get
(
analysis_task
.
status
),
"status"
:
analysis_pdf
.
status
,
"url"
:
file_url
,
"fileName"
:
analysis_task
.
file_name
,
"fileName"
:
file_name
,
"content"
:
[],
"markdownUrl"
:
[],
"fullMdLink"
:
""
,
...
...
@@ -83,13 +85,13 @@ class AnalysisTaskProgressView(Resource):
}
return
generate_response
(
data
=
data
)
case
'formula-detect'
:
pass
return
generate_response
(
code
=
400
,
msg
=
"Not yet supported"
,
msgZH
=
"功能待开发"
)
case
'formula-extract'
:
pass
return
generate_response
(
code
=
400
,
msg
=
"Not yet supported"
,
msgZH
=
"功能待开发"
)
case
'table-recogn'
:
return
generate_response
(
code
=
400
,
msg
=
"Not yet supported"
,
msgZH
=
"
尚不支持
"
)
return
generate_response
(
code
=
400
,
msg
=
"Not yet supported"
,
msgZH
=
"
功能待开发
"
)
case
_
:
return
generate_response
()
return
generate_response
(
code
=
400
,
msg
=
"Not yet supported"
,
msgZH
=
"参数不支持"
)
class
AnalysisTaskView
(
Resource
):
...
...
@@ -181,6 +183,8 @@ class AnalysisTaskView(Resource):
params
=
json
.
loads
(
request
.
data
)
id
=
params
.
get
(
'id'
)
analysis_task
=
AnalysisTask
.
query
.
filter
(
AnalysisTask
.
id
==
id
)
.
first
()
if
not
analysis_task
:
return
generate_response
(
code
=
400
,
msg
=
"Invalid ID"
,
msgZH
=
"无效id"
)
match
analysis_task
.
task_type
:
case
'pdf'
:
task_r_p
=
AnalysisTask
.
query
.
filter
(
AnalysisTask
.
status
.
in_
([
0
,
2
]))
.
first
()
...
...
@@ -215,9 +219,11 @@ class AnalysisTaskView(Resource):
# 生成文件的URL路径
file_url
=
url_for
(
'analysis.uploadpdfview'
,
filename
=
analysis_task
.
file_name
,
as_attachment
=
False
)
file_name_split
=
analysis_task
.
file_name
.
split
(
"_"
)
new_file_name
=
file_name_split
[
-
1
]
if
file_name_split
else
analysis_task
.
file_name
data
=
{
"url"
:
file_url
,
"fileName"
:
analysis_task
.
file_name
,
"fileName"
:
new_
file_name
,
"id"
:
analysis_task
.
id
}
return
generate_response
(
data
=
data
)
...
...
projects/web_api/web_api/api/analysis/ext.py
View file @
8b275ea2
...
...
@@ -2,8 +2,8 @@ import os
task_state_map
=
{
0
:
"running"
,
1
:
"
finished
"
,
2
:
"pending"
,
1
:
"
done
"
,
2
:
"pending"
}
...
...
projects/web_api/web_api/api/analysis/models.py
View file @
8b275ea2
...
...
@@ -9,7 +9,7 @@ class AnalysisTask(db.Model):
file_name
=
db
.
Column
(
db
.
Text
,
comment
=
"文件名称"
)
task_type
=
db
.
Column
(
db
.
String
(
128
),
comment
=
"任务类型"
)
is_ocr
=
db
.
Column
(
db
.
Boolean
,
default
=
False
,
comment
=
"是否ocr"
)
status
=
db
.
Column
(
db
.
Integer
,
default
=
0
,
comment
=
"状态"
)
# 0 running 1
finished
2 pending
status
=
db
.
Column
(
db
.
Integer
,
default
=
0
,
comment
=
"状态"
)
# 0 running 1
done
2 pending
analysis_pdf_id
=
db
.
Column
(
db
.
Integer
,
comment
=
"analysis_pdf的id"
)
create_date
=
db
.
Column
(
db
.
DateTime
(),
nullable
=
False
,
default
=
datetime
.
now
)
update_date
=
db
.
Column
(
db
.
DateTime
(),
nullable
=
False
,
default
=
datetime
.
now
,
onupdate
=
datetime
.
now
)
...
...
projects/web_api/web_api/api/analysis/pdf_ext.py
View file @
8b275ea2
...
...
@@ -88,8 +88,9 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
img_name
=
Path
(
img
)
.
name
regex
=
re
.
compile
(
fr
'.*
\
((.*?{img_name})'
)
regex_result
=
regex
.
search
(
md_content
)
img_url
=
url_for
(
'analysis.imgview'
,
filename
=
img_name
,
as_attachment
=
False
)
md_content
=
md_content
.
replace
(
regex_result
.
group
(
1
),
f
"{img_url}&pdf={pdf_name}"
)
if
regex_result
:
img_url
=
url_for
(
'analysis.imgview'
,
filename
=
img_name
,
as_attachment
=
False
)
md_content
=
md_content
.
replace
(
regex_result
.
group
(
1
),
f
"{img_url}&pdf={pdf_name}"
)
full_md_content
=
""
for
item
in
json
.
loads
(
md_content
):
...
...
projects/web_api/web_api/api/analysis/task_view.py
View file @
8b275ea2
...
...
@@ -18,25 +18,33 @@ class TaskView(Resource):
analysis_task_pending
=
AnalysisTask
.
query
.
filter
(
AnalysisTask
.
status
==
2
)
.
order_by
(
AnalysisTask
.
create_date
.
asc
())
.
all
()
pending_total
=
db
.
session
.
query
(
func
.
count
(
AnalysisTask
.
id
))
.
filter
(
AnalysisTask
.
status
==
2
)
.
scalar
()
task_nums
=
pending_total
+
1
data
=
[
{
"queues"
:
task_nums
,
# 正在排队的任务总数
"rank"
:
1
,
"id"
:
analysis_task_running
.
id
,
"url"
:
url_for
(
'analysis.uploadpdfview'
,
filename
=
analysis_task_running
.
file_name
,
as_attachment
=
False
),
"fileName"
:
analysis_task_running
.
file_name
,
"type"
:
analysis_task_running
.
task_type
,
"state"
:
task_state_map
.
get
(
analysis_task_running
.
status
),
}
]
if
analysis_task_running
:
task_nums
=
pending_total
+
1
file_name_split
=
analysis_task_running
.
file_name
.
split
(
"_"
)
new_file_name
=
file_name_split
[
-
1
]
if
file_name_split
else
analysis_task_running
.
file_name
data
=
[
{
"queues"
:
task_nums
,
# 正在排队的任务总数
"rank"
:
1
,
"id"
:
analysis_task_running
.
id
,
"url"
:
url_for
(
'analysis.uploadpdfview'
,
filename
=
analysis_task_running
.
file_name
,
as_attachment
=
False
),
"fileName"
:
new_file_name
,
"type"
:
analysis_task_running
.
task_type
,
"state"
:
task_state_map
.
get
(
analysis_task_running
.
status
),
}
]
else
:
task_nums
=
pending_total
data
=
[]
for
n
,
task
in
enumerate
(
analysis_task_pending
):
file_name_split
=
task
.
file_name
.
split
(
"_"
)
new_file_name
=
file_name_split
[
-
1
]
if
file_name_split
else
task
.
file_name
data
.
append
({
"queues"
:
task_nums
,
# 正在排队的任务总数
"rank"
:
n
+
2
,
"id"
:
task
.
id
,
"url"
:
url_for
(
'analysis.uploadpdfview'
,
filename
=
task
.
file_name
,
as_attachment
=
False
),
"fileName"
:
task
.
file_name
,
"fileName"
:
new_
file_name
,
"type"
:
task
.
task_type
,
"state"
:
task_state_map
.
get
(
task
.
status
),
})
...
...
@@ -59,8 +67,10 @@ class HistoricalTasksView(Resource):
error_out
=
False
)
data
=
[]
for
n
,
task
in
enumerate
(
analysis_task
):
file_name_split
=
task
.
file_name
.
split
(
"_"
)
new_file_name
=
file_name_split
[
-
1
]
if
file_name_split
else
task
.
file_name
data
.
append
({
"fileName"
:
task
.
file_name
,
"fileName"
:
new_
file_name
,
"id"
:
task
.
id
,
"type"
:
task
.
task_type
,
"state"
:
task_state_map
.
get
(
task
.
status
),
...
...
@@ -75,14 +85,11 @@ class HistoricalTasksView(Resource):
class
DeleteTaskView
(
Resource
):
def
delete
(
self
):
def
delete
(
self
,
id
):
"""
删除任务历史记录
:return:
"""
params
=
json
.
loads
(
request
.
data
)
id
=
params
.
get
(
'id'
)
analysis_task
=
AnalysisTask
.
query
.
filter
(
AnalysisTask
.
id
==
id
,
AnalysisTask
.
status
!=
0
)
.
first
()
if
analysis_task
:
analysis_pdf
=
AnalysisPdf
.
query
.
filter
(
AnalysisPdf
.
id
==
AnalysisTask
.
analysis_pdf_id
)
.
first
()
...
...
projects/web_api/web_api/api/analysis/upload_view.py
View file @
8b275ea2
import
json
import
time
import
traceback
import
requests
from
flask
import
request
,
current_app
,
url_for
,
send_from_directory
...
...
@@ -67,8 +68,7 @@ class UploadPdfView(Resource):
upload_dir
=
f
"{current_app.static_folder}/{pdf_upload_folder}"
if
not
Path
(
upload_dir
)
.
exists
():
Path
(
upload_dir
)
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
file_key
=
calculate_file_hash
(
file
)
# new_filename = f"{int(time.time())}_{filename}"
file_key
=
f
"{calculate_file_hash(file)}{int(time.time())}"
new_filename
=
f
"{file_key}_{filename}"
file_path
=
f
"{upload_dir}/{new_filename}"
# file.save(file_path)
...
...
projects/web_api/web_api/common/ext.py
View file @
8b275ea2
...
...
@@ -11,8 +11,8 @@ def is_pdf(filename, file):
:return: 如果文件是PDF格式,则返回True,否则返回False
"""
# 检查文件扩展名 https://arxiv.org/pdf/2405.08702 pdf链接可能存在不带扩展名的情况,先注释
if
not
filename
.
endswith
(
'.pdf'
):
return
False
#
if not filename.endswith('.pdf'):
#
return False
# 检查MIME类型
mime_type
,
_
=
mimetypes
.
guess_type
(
filename
)
...
...
projects/web_api/web_api/config/mineru_web.db
View file @
8b275ea2
No preview for this file type
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment