Commit 8b275ea2 authored by houlinfeng

fix: mineru_web

parent 2730b96b
......@@ -15,4 +15,4 @@ api_v2.add_resource(ImgView, '/analysis/pdf_img')
api_v2.add_resource(MdView, '/analysis/pdf_md')
api_v2.add_resource(TaskView, '/extract/taskQueue')
api_v2.add_resource(HistoricalTasksView, '/extract/list')
api_v2.add_resource(DeleteTaskView, '/extract/task')
\ No newline at end of file
api_v2.add_resource(DeleteTaskView, '/extract/task/<int:id>')
\ No newline at end of file
......@@ -29,12 +29,14 @@ class AnalysisTaskProgressView(Resource):
case 'pdf':
analysis_pdf = AnalysisPdf.query.filter(AnalysisPdf.id == analysis_task.analysis_pdf_id).first()
file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False)
file_name_split = analysis_task.file_name.split("_")
file_name = file_name_split[-1] if file_name_split else analysis_task.file_name
if analysis_task.status == 0:
data = {
"state": task_state_map.get(analysis_task.status),
"status": analysis_pdf.status,
"url": file_url,
"fileName": analysis_task.file_name,
"fileName": file_name,
"content": [],
"markdownUrl": [],
"fullMdLink": "",
......@@ -47,10 +49,10 @@ class AnalysisTaskProgressView(Resource):
md_link_list = json.loads(analysis_pdf.md_link_list)
full_md_link = analysis_pdf.full_md_link
data = {
"state": task_state_map.get(analysis_task.status),
"state": "failed",
"status": analysis_pdf.status,
"url": file_url,
"fileName": analysis_task.file_name,
"fileName": file_name,
"content": bbox_info,
"markdownUrl": md_link_list,
"fullMdLink": full_md_link,
......@@ -62,7 +64,7 @@ class AnalysisTaskProgressView(Resource):
"state": task_state_map.get(analysis_task.status),
"status": analysis_pdf.status,
"url": file_url,
"fileName": analysis_task.file_name,
"fileName": file_name,
"content": [],
"markdownUrl": [],
"fullMdLink": "",
......@@ -75,7 +77,7 @@ class AnalysisTaskProgressView(Resource):
"state": task_state_map.get(analysis_task.status),
"status": analysis_pdf.status,
"url": file_url,
"fileName": analysis_task.file_name,
"fileName": file_name,
"content": [],
"markdownUrl": [],
"fullMdLink": "",
......@@ -83,13 +85,13 @@ class AnalysisTaskProgressView(Resource):
}
return generate_response(data=data)
case 'formula-detect':
pass
return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
case 'formula-extract':
pass
return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
case 'table-recogn':
return generate_response(code=400, msg="Not yet supported", msgZH="尚不支持")
return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
case _:
return generate_response()
return generate_response(code=400, msg="Not yet supported", msgZH="参数不支持")
class AnalysisTaskView(Resource):
......@@ -181,6 +183,8 @@ class AnalysisTaskView(Resource):
params = json.loads(request.data)
id = params.get('id')
analysis_task = AnalysisTask.query.filter(AnalysisTask.id == id).first()
if not analysis_task:
return generate_response(code=400, msg="Invalid ID", msgZH="无效id")
match analysis_task.task_type:
case 'pdf':
task_r_p = AnalysisTask.query.filter(AnalysisTask.status.in_([0, 2])).first()
......@@ -215,9 +219,11 @@ class AnalysisTaskView(Resource):
# 生成文件的URL路径
file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False)
file_name_split = analysis_task.file_name.split("_")
new_file_name = file_name_split[-1] if file_name_split else analysis_task.file_name
data = {
"url": file_url,
"fileName": analysis_task.file_name,
"fileName": new_file_name,
"id": analysis_task.id
}
return generate_response(data=data)
......
......@@ -2,8 +2,8 @@ import os
task_state_map = {
0: "running",
1: "finished",
2: "pending",
1: "done",
2: "pending"
}
......
......@@ -9,7 +9,7 @@ class AnalysisTask(db.Model):
file_name = db.Column(db.Text, comment="文件名称")
task_type = db.Column(db.String(128), comment="任务类型")
is_ocr = db.Column(db.Boolean, default=False, comment="是否ocr")
status = db.Column(db.Integer, default=0, comment="状态") # 0 running 1 finished 2 pending
status = db.Column(db.Integer, default=0, comment="状态") # 0 running 1 done 2 pending
analysis_pdf_id = db.Column(db.Integer, comment="analysis_pdf的id")
create_date = db.Column(db.DateTime(), nullable=False, default=datetime.now)
update_date = db.Column(db.DateTime(), nullable=False, default=datetime.now, onupdate=datetime.now)
......
......@@ -88,8 +88,9 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
img_name = Path(img).name
regex = re.compile(fr'.*\((.*?{img_name})')
regex_result = regex.search(md_content)
img_url = url_for('analysis.imgview', filename=img_name, as_attachment=False)
md_content = md_content.replace(regex_result.group(1), f"{img_url}&pdf={pdf_name}")
if regex_result:
img_url = url_for('analysis.imgview', filename=img_name, as_attachment=False)
md_content = md_content.replace(regex_result.group(1), f"{img_url}&pdf={pdf_name}")
full_md_content = ""
for item in json.loads(md_content):
......
......@@ -18,25 +18,33 @@ class TaskView(Resource):
analysis_task_pending = AnalysisTask.query.filter(AnalysisTask.status == 2).order_by(
AnalysisTask.create_date.asc()).all()
pending_total = db.session.query(func.count(AnalysisTask.id)).filter(AnalysisTask.status == 2).scalar()
task_nums = pending_total + 1
data = [
{
"queues": task_nums, # 正在排队的任务总数
"rank": 1,
"id": analysis_task_running.id,
"url": url_for('analysis.uploadpdfview', filename=analysis_task_running.file_name, as_attachment=False),
"fileName": analysis_task_running.file_name,
"type": analysis_task_running.task_type,
"state": task_state_map.get(analysis_task_running.status),
}
]
if analysis_task_running:
task_nums = pending_total + 1
file_name_split = analysis_task_running.file_name.split("_")
new_file_name = file_name_split[-1] if file_name_split else analysis_task_running.file_name
data = [
{
"queues": task_nums, # 正在排队的任务总数
"rank": 1,
"id": analysis_task_running.id,
"url": url_for('analysis.uploadpdfview', filename=analysis_task_running.file_name, as_attachment=False),
"fileName": new_file_name,
"type": analysis_task_running.task_type,
"state": task_state_map.get(analysis_task_running.status),
}
]
else:
task_nums = pending_total
data = []
for n, task in enumerate(analysis_task_pending):
file_name_split = task.file_name.split("_")
new_file_name = file_name_split[-1] if file_name_split else task.file_name
data.append({
"queues": task_nums, # 正在排队的任务总数
"rank": n + 2,
"id": task.id,
"url": url_for('analysis.uploadpdfview', filename=task.file_name, as_attachment=False),
"fileName": task.file_name,
"fileName": new_file_name,
"type": task.task_type,
"state": task_state_map.get(task.status),
})
......@@ -59,8 +67,10 @@ class HistoricalTasksView(Resource):
error_out=False)
data = []
for n, task in enumerate(analysis_task):
file_name_split = task.file_name.split("_")
new_file_name = file_name_split[-1] if file_name_split else task.file_name
data.append({
"fileName": task.file_name,
"fileName": new_file_name,
"id": task.id,
"type": task.task_type,
"state": task_state_map.get(task.status),
......@@ -75,14 +85,11 @@ class HistoricalTasksView(Resource):
class DeleteTaskView(Resource):
def delete(self):
def delete(self, id):
"""
删除任务历史记录
:return:
"""
params = json.loads(request.data)
id = params.get('id')
analysis_task = AnalysisTask.query.filter(AnalysisTask.id == id, AnalysisTask.status != 0).first()
if analysis_task:
analysis_pdf = AnalysisPdf.query.filter(AnalysisPdf.id == AnalysisTask.analysis_pdf_id).first()
......
import json
import time
import traceback
import requests
from flask import request, current_app, url_for, send_from_directory
......@@ -67,8 +68,7 @@ class UploadPdfView(Resource):
upload_dir = f"{current_app.static_folder}/{pdf_upload_folder}"
if not Path(upload_dir).exists():
Path(upload_dir).mkdir(parents=True, exist_ok=True)
file_key = calculate_file_hash(file)
# new_filename = f"{int(time.time())}_{filename}"
file_key = f"{calculate_file_hash(file)}{int(time.time())}"
new_filename = f"{file_key}_{filename}"
file_path = f"{upload_dir}/{new_filename}"
# file.save(file_path)
......
......@@ -11,8 +11,8 @@ def is_pdf(filename, file):
:return: 如果文件是PDF格式,则返回True,否则返回False
"""
# 检查文件扩展名 https://arxiv.org/pdf/2405.08702 pdf链接可能存在不带扩展名的情况,先注释
if not filename.endswith('.pdf'):
return False
# if not filename.endswith('.pdf'):
# return False
# 检查MIME类型
mime_type, _ = mimetypes.guess_type(filename)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment