Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
38086572
Commit
38086572
authored
Sep 20, 2024
by
houlinfeng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat: mineru web and web_api
parent
e92c896a
Changes
16
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
125 additions
and
26 deletions
+125
-26
.gitignore
.gitignore
+2
-1
README.md
projects/web/README.md
+1
-1
package.json
projects/web/package.json
+2
-0
home.tsx
projects/web/src/pages/home.tsx
+3
-3
README.md
projects/web_api/README.md
+24
-15
mineru-web接口文档.html
projects/web_api/mineru-web接口文档.html
+0
-0
requirements.txt
projects/web_api/requirements.txt
+13
-0
__init__.py
projects/web_api/web_api/api/__init__.py
+3
-1
__init__.py
projects/web_api/web_api/api/analysis/__init__.py
+3
-1
analysis_view.py
projects/web_api/web_api/api/analysis/analysis_view.py
+5
-4
markdown_view.py
projects/web_api/web_api/api/analysis/markdown_view.py
+44
-0
extentions.py
projects/web_api/web_api/api/extentions.py
+1
-0
__init__.py
projects/web_api/web_api/api/react_app/__init__.py
+11
-0
react_app_view.py
projects/web_api/web_api/api/react_app/react_app_view.py
+11
-0
config.yaml
projects/web_api/web_api/config/config.yaml
+2
-0
mineru_web.db
projects/web_api/web_api/config/mineru_web.db
+0
-0
No files found.
.gitignore
View file @
38086572
...
...
@@ -38,4 +38,5 @@ source.dev.env
tmp
projects/web/node_modules
\ No newline at end of file
projects/web/node_modules
projects/web/dist
projects/web/README.md
View file @
38086572
...
...
@@ -8,5 +8,5 @@ npm install -g pnpm
3.
build
```
1.pnpm run build
2.npm run buil
2.npm run buil
d
```
\ No newline at end of file
projects/web/package.json
View file @
38086572
...
...
@@ -22,9 +22,11 @@
"ahooks"
:
"^3.8.1"
,
"antd"
:
"^5.20.3"
,
"axios"
:
"^1.7.5"
,
"canvas"
:
"^2.11.2"
,
"classnames"
:
"^2.5.1"
,
"js-cookie"
:
"^3.0.5"
,
"lodash"
:
"^4.17.21"
,
"path2d"
:
"^0.2.1"
,
"qs"
:
"^6.13.0"
,
"react"
:
"^18.3.1"
,
"react-copy-to-clipboard"
:
"^5.1.0"
,
...
...
projects/web/src/pages/home.tsx
View file @
38086572
...
...
@@ -3,7 +3,7 @@
import
ErrorBoundary
from
"@/components/error-boundary"
;
import
styles
from
"./home.module.scss"
;
import
{
SlotID
,
Path
}
from
"@/constant/route"
;
import
{
Browser
Router
,
Routes
,
Route
,
Outlet
}
from
"react-router-dom"
;
import
{
Hash
Router
,
Routes
,
Route
,
Outlet
}
from
"react-router-dom"
;
import
{
ExtractorSide
}
from
"./extract-side"
;
import
{
LanguageProvider
}
from
"@/context/language-provider"
;
import
PDFUpload
from
"@/pages/extract/components/pdf-upload"
;
...
...
@@ -70,9 +70,9 @@ export function Home() {
return
(
<
ErrorBoundary
>
<
LanguageProvider
>
<
Browser
Router
>
<
Hash
Router
>
<
Screen
/>
</
Browser
Router
>
</
Hash
Router
>
</
LanguageProvider
>
</
ErrorBoundary
>
);
...
...
projects/web_api/README.md
View file @
38086572
##
安装
##
Mineru 本地API服务
MinerU
```
bash
# mineru已安装则跳过此步骤
```
# 服务依赖mineru,请先确保mineru已安装
```
git clone https://github.com/opendatalab/MinerU.git
cd
MinerU
1.
打包前端界面
conda create
-n
MinerU
python
=
3.10
conda activate MinerU
pip
install
.[full]
--extra-index-url
https://wheels.myhloli.com
```
bash
# 先进入前端目录
cd
projects/web
# 打包前端项目
npm
install
-g
yarn
yarn
install
yarn build
```
第三方软件
2.
安装服务依赖
```
bash
# 先进入后端目录
cd
projects/web_api
pip
install
poetry
p
ortey
install
# 安装依赖
p
ip3
install
-r
requirements.txt
-i
https://pypi.tuna.tsinghua.edu.cn/simple
```
启动服务
3.
启动服务
```
bash
cd
web_api
python app.py
# 进入程序目录
cd
projects/web_api/web_api
# 启动服务
python3 app.py
# 在浏览器访问启动的地址即可访问界面
```
接口文档
ps:接口文档
```
在浏览器打开 mineru-web接口文档.html
```
projects/web_api/mineru-web接口文档.html
View file @
38086572
This source diff could not be displayed because it is too large. You can
view the blob
instead.
projects/web_api/requirements.txt
0 → 100644
View file @
38086572
flask-cors
flask-jwt-extended
flask-marshmallow
flask-migrate
flask-restful
flask-sqlalchemy
flask
greenlet
loguru
marshmallow-sqlalchemy
marshmallow
pyjwt
pyyaml
projects/web_api/web_api/api/__init__.py
View file @
38086572
...
...
@@ -4,7 +4,7 @@ from common.web_hook import before_request
from
common.logger
import
setup_log
root_dir
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)))
print
(
"root_dir"
,
root_dir
)
def
_register_db
(
flask_app
):
from
common
import
import_models
...
...
@@ -30,6 +30,8 @@ def create_app(config):
ma
.
init_app
(
app
=
app
)
from
.analysis
import
analysis_blue
app
.
register_blueprint
(
analysis_blue
)
from
.react_app
import
react_app_blue
app
.
register_blueprint
(
react_app_blue
)
app
.
before_request
(
before_request
)
...
...
projects/web_api/web_api/api/analysis/__init__.py
View file @
38086572
...
...
@@ -4,6 +4,7 @@ from .upload_view import UploadPdfView
from
.analysis_view
import
AnalysisTaskView
,
AnalysisTaskProgressView
from
.img_md_view
import
ImgView
,
MdView
from
.task_view
import
TaskView
,
HistoricalTasksView
,
DeleteTaskView
from
.markdown_view
import
MarkdownView
analysis_blue
=
Blueprint
(
'analysis'
,
__name__
)
...
...
@@ -15,4 +16,5 @@ api_v2.add_resource(ImgView, '/analysis/pdf_img')
api_v2
.
add_resource
(
MdView
,
'/analysis/pdf_md'
)
api_v2
.
add_resource
(
TaskView
,
'/extract/taskQueue'
)
api_v2
.
add_resource
(
HistoricalTasksView
,
'/extract/list'
)
api_v2
.
add_resource
(
DeleteTaskView
,
'/extract/task/<int:id>'
)
\ No newline at end of file
api_v2
.
add_resource
(
DeleteTaskView
,
'/extract/task/<int:id>'
)
api_v2
.
add_resource
(
MarkdownView
,
'/extract/markdown'
)
\ No newline at end of file
projects/web_api/web_api/api/analysis/analysis_view.py
View file @
38086572
import
json
import
threading
from
multiprocessing
import
Process
from
pathlib
import
Path
from
flask
import
request
,
current_app
,
url_for
from
flask_restful
import
Resource
...
...
@@ -212,10 +213,10 @@ class AnalysisTaskView(Resource):
pdf_analysis_folder
=
current_app
.
config
[
'PDF_ANALYSIS_FOLDER'
]
pdf_dir
=
f
"{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}"
image_dir
=
f
"{pdf_dir}/images"
t
=
threading
.
Thread
(
target
=
analysis_pdf_task
,
args
=
(
pdf_dir
,
image_dir
,
file_path
,
analysis_task
.
is_ocr
,
analysis_task
.
analysis_pdf_id
))
t
.
start
()
process
=
Process
(
target
=
analysis_pdf_task
,
args
=
(
pdf_dir
,
image_dir
,
file_path
,
analysis_task
.
is_ocr
,
analysis_task
.
analysis_pdf_id
))
process
.
start
()
# 生成文件的URL路径
file_url
=
url_for
(
'analysis.uploadpdfview'
,
filename
=
analysis_task
.
file_name
,
as_attachment
=
False
)
...
...
projects/web_api/web_api/api/analysis/markdown_view.py
0 → 100644
View file @
38086572
import
json
from
pathlib
import
Path
from
flask
import
request
,
current_app
from
flask_restful
import
Resource
from
common.custom_response
import
generate_response
class MarkdownView(Resource):
    """REST resource that edits the markdown files stored for an analysed PDF."""

    def put(self):
        """
        Edit markdown.

        Reads a JSON request body containing:

        * ``file_key`` -- a non-empty string; the first sub-directory of the
          configured analysis folder whose name starts with it is the target
          (presumably a file hash, judging by the error message -- confirm
          with the caller).
        * ``data`` -- a non-empty mapping of markdown file stem to the new
          text for ``<stem>.md``.

        Only ``.md`` files that already exist directly inside the target
        directory are overwritten; afterwards ``full.md`` is rebuilt by
        concatenating every other ``.md`` file in that directory.

        Returns:
            ``generate_response()`` on success, or a ``code=400`` response
            when the body is not a JSON object, ``data`` is empty or
            malformed, or no directory matches ``file_key``.
        """
        # Malformed JSON (or a JSON value that is not an object) is a client
        # error, not a server crash.
        try:
            params = json.loads(request.data)
        except ValueError:
            params = None
        if not isinstance(params, dict):
            return generate_response(code=400, msg="Invalid request body", msgZH="请求体不是合法的JSON对象")

        file_key = params.get('file_key')
        data = params.get('data', {})
        if not data:
            return generate_response(code=400, msg="empty data", msgZH="数据为空,无法更新markdown")
        # Validate the whole payload up front so a bad entry cannot leave the
        # directory half-updated.
        if not isinstance(data, dict) or not all(isinstance(text, str) for text in data.values()):
            return generate_response(code=400, msg="Invalid data", msgZH="数据格式错误")
        # An empty key would match every directory name; a missing or
        # non-string key would make str.startswith raise.
        if not isinstance(file_key, str) or not file_key:
            return generate_response(code=400, msg="Invalid file_key", msgZH="文件哈希错误")

        pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
        analysis_root = Path(f"{current_app.static_folder}/{pdf_analysis_folder}")

        markdown_file_dir = None
        if analysis_root.is_dir():
            markdown_file_dir = next(
                (
                    entry
                    for entry in analysis_root.iterdir()
                    if entry.is_dir() and entry.name.startswith(file_key)
                ),
                None,
            )
        if markdown_file_dir is None:
            return generate_response(code=400, msg="Invalid file_key", msgZH="文件哈希错误")

        for stem, text in data.items():
            md_path = markdown_file_dir / f"{stem}.md"
            # Keys come from the request: ignore anything that would resolve
            # outside the matched directory (e.g. "../x" or an absolute path).
            if md_path.parent != markdown_file_dir:
                continue
            if md_path.is_file():
                md_path.write_text(text, encoding="utf-8")

        # NOTE(review): the concatenation order follows Path.iterdir(), which
        # is filesystem dependent -- confirm whether a specific order is
        # expected for full.md.
        sections = [
            entry.read_text(encoding="utf-8") + "\n"
            for entry in markdown_file_dir.iterdir()
            if entry.is_file() and entry.suffix == ".md" and entry.stem != "full"
        ]
        (markdown_file_dir / "full.md").write_text("".join(sections), encoding="utf-8")

        return generate_response()
projects/web_api/web_api/api/extentions.py
View file @
38086572
...
...
@@ -59,3 +59,4 @@ db = SQLAlchemy()
migrate
=
Migrate
()
jwt
=
JWTManager
()
ma
=
Marshmallow
()
folder
=
app
.
config
.
get
(
"REACT_APP_DIST"
)
projects/web_api/web_api/api/react_app/__init__.py
0 → 100644
View file @
38086572
from
pathlib
import
Path
from
flask
import
Blueprint
from
..extentions
import
app
,
Api
from
.react_app_view
import
ReactAppView
from
loguru
import
logger
# Directory holding the packaged front-end project; taken from the
# REACT_APP_DIST setting with "../../web/dist/" as the fallback, then made
# absolute.
folder = Path(app.config.get("REACT_APP_DIST", "../../web/dist/"))
folder = folder.resolve()
logger.info(f"react_app folder: {folder}")

# The same directory is used for both static files and templates.
react_app_blue = Blueprint(
    'react_app',
    __name__,
    static_folder=folder,
    static_url_path='',
    template_folder=folder,
)

# Register ReactAppView at the blueprint root.
react_app_api = Api(react_app_blue, prefix='')
react_app_api.add_resource(ReactAppView, '/')
\ No newline at end of file
projects/web_api/web_api/api/react_app/react_app_view.py
0 → 100644
View file @
38086572
from
flask
import
render_template
,
Response
from
flask_restful
import
Resource
class ReactAppView(Resource):
    """Resource that returns the rendered ``index.html`` template as HTML."""

    def get(self):
        """Render ``index.html`` and return it in a ``text/html`` response."""
        # Build a custom response object instead of returning the bare string.
        page = render_template('index.html')
        return Response(page, mimetype='text/html')
projects/web_api/web_api/config/config.yaml
View file @
38086572
...
...
@@ -11,6 +11,8 @@ BaseConfig: &base
JWT_ACCESS_TOKEN_EXPIRES
:
3600
PDF_UPLOAD_FOLDER
:
"
upload_pdf"
PDF_ANALYSIS_FOLDER
:
"
analysis_pdf"
# 前端项目打包的路径
REACT_APP_DIST
:
"
../../web/dist/"
# 开发配置
DevelopmentConfig
:
...
...
projects/web_api/web_api/config/mineru_web.db
View file @
38086572
No preview for this file type
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment