Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
e36627be
Unverified
Commit
e36627be
authored
Oct 23, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 23, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #777 from myhloli/add-doclayout-yolo
feat: add support for non-PDF file conversion to PDF
parents
d1c0546a
4834baf4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
28 additions
and
5 deletions
+28
-5
app.py
projects/gradio_app/app.py
+28
-5
No files found.
projects/gradio_app/app.py
View file @
e36627be
...
@@ -3,10 +3,12 @@
...
@@ -3,10 +3,12 @@
import
base64
import
base64
import
os
import
os
import
time
import
time
import
uuid
import
zipfile
import
zipfile
from
pathlib
import
Path
from
pathlib
import
Path
import
re
import
re
import
pymupdf
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.hash_utils
import
compute_sha256
from
magic_pdf.libs.hash_utils
import
compute_sha256
...
@@ -164,12 +166,32 @@ all_lang = [""]
...
@@ -164,12 +166,32 @@ all_lang = [""]
all_lang
.
extend
([
*
other_lang
,
*
latin_lang
,
*
arabic_lang
,
*
cyrillic_lang
,
*
devanagari_lang
])
all_lang
.
extend
([
*
other_lang
,
*
latin_lang
,
*
arabic_lang
,
*
cyrillic_lang
,
*
devanagari_lang
])
def to_pdf(file_path):
    """Return the path of a PDF version of ``file_path``.

    The file is opened with ``pymupdf.open``. When the opened document
    reports ``is_pdf``, ``file_path`` is returned unchanged. Otherwise the
    document is converted with ``convert_to_pdf()`` and the resulting bytes
    are written to a new ``<uuid4>.pdf`` file in the same directory as
    ``file_path``; the path of that new file is returned.

    Args:
        file_path: Path of the file to open.

    Returns:
        ``file_path`` itself for PDF input, otherwise the path of the newly
        written PDF file.
    """
    with pymupdf.open(file_path) as doc:
        # Already a PDF: nothing to convert, hand the original path back.
        if doc.is_pdf:
            return file_path

        converted = doc.convert_to_pdf()

        # Use a uuid4-based file name for the converted output.
        pdf_name = f"{uuid.uuid4()}.pdf"

        # Place the converted file in the same directory as the input.
        source_dir = os.path.dirname(file_path)
        pdf_path = os.path.join(source_dir, pdf_name)

        # Write the converted bytes to disk.
        with open(pdf_path, 'wb') as out_file:
            out_file.write(converted)

        return pdf_path
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
with
gr
.
Blocks
()
as
demo
:
with
gr
.
Blocks
()
as
demo
:
gr
.
HTML
(
header
)
gr
.
HTML
(
header
)
with
gr
.
Row
():
with
gr
.
Row
():
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
pdf_show
=
gr
.
Markdown
(
)
file
=
gr
.
File
(
label
=
"Please upload a PDF or image"
,
file_types
=
[
".pdf"
,
".png"
,
".jpeg"
,
"jpg"
]
)
max_pages
=
gr
.
Slider
(
1
,
10
,
5
,
step
=
1
,
label
=
"Max convert pages"
)
max_pages
=
gr
.
Slider
(
1
,
10
,
5
,
step
=
1
,
label
=
"Max convert pages"
)
with
gr
.
Row
():
with
gr
.
Row
():
layout_mode
=
gr
.
Dropdown
([
"layoutlmv3"
,
"doclayout_yolo"
],
label
=
"Layout model"
,
value
=
"layoutlmv3"
)
layout_mode
=
gr
.
Dropdown
([
"layoutlmv3"
,
"doclayout_yolo"
],
label
=
"Layout model"
,
value
=
"layoutlmv3"
)
...
@@ -180,14 +202,14 @@ if __name__ == "__main__":
...
@@ -180,14 +202,14 @@ if __name__ == "__main__":
table_enable
=
gr
.
Checkbox
(
label
=
"Enable table recognition(test)"
,
value
=
False
)
table_enable
=
gr
.
Checkbox
(
label
=
"Enable table recognition(test)"
,
value
=
False
)
with
gr
.
Row
():
with
gr
.
Row
():
change_bu
=
gr
.
Button
(
"Convert"
)
change_bu
=
gr
.
Button
(
"Convert"
)
clear_bu
=
gr
.
ClearButton
(
[
pdf_show
],
value
=
"Clear"
)
clear_bu
=
gr
.
ClearButton
(
value
=
"Clear"
)
pdf_show
=
PDF
(
label
=
"P
lease upload pdf
"
,
interactive
=
True
,
height
=
800
)
pdf_show
=
PDF
(
label
=
"P
DF preview
"
,
interactive
=
True
,
height
=
800
)
with
gr
.
Accordion
(
"Examples:"
):
with
gr
.
Accordion
(
"Examples:"
):
example_root
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"examples"
)
example_root
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"examples"
)
gr
.
Examples
(
gr
.
Examples
(
examples
=
[
os
.
path
.
join
(
example_root
,
_
)
for
_
in
os
.
listdir
(
example_root
)
if
examples
=
[
os
.
path
.
join
(
example_root
,
_
)
for
_
in
os
.
listdir
(
example_root
)
if
_
.
endswith
(
"pdf"
)],
_
.
endswith
(
"pdf"
)],
inputs
=
pdf_show
,
inputs
=
pdf_show
)
)
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
...
@@ -198,8 +220,9 @@ if __name__ == "__main__":
...
@@ -198,8 +220,9 @@ if __name__ == "__main__":
latex_delimiters
=
latex_delimiters
,
line_breaks
=
True
)
latex_delimiters
=
latex_delimiters
,
line_breaks
=
True
)
with
gr
.
Tab
(
"Markdown text"
):
with
gr
.
Tab
(
"Markdown text"
):
md_text
=
gr
.
TextArea
(
lines
=
45
,
show_copy_button
=
True
)
md_text
=
gr
.
TextArea
(
lines
=
45
,
show_copy_button
=
True
)
file
.
upload
(
fn
=
to_pdf
,
inputs
=
file
,
outputs
=
pdf_show
)
change_bu
.
click
(
fn
=
to_markdown
,
inputs
=
[
pdf_show
,
max_pages
,
is_ocr
,
layout_mode
,
formula_enable
,
table_enable
,
language
],
change_bu
.
click
(
fn
=
to_markdown
,
inputs
=
[
pdf_show
,
max_pages
,
is_ocr
,
layout_mode
,
formula_enable
,
table_enable
,
language
],
outputs
=
[
md
,
md_text
,
output_file
,
pdf_show
])
outputs
=
[
md
,
md_text
,
output_file
,
pdf_show
])
clear_bu
.
add
([
md
,
pdf_show
,
md_text
,
output_file
,
is_ocr
])
clear_bu
.
add
([
file
,
md
,
pdf_show
,
md_text
,
output_file
,
is_ocr
,
table_enable
,
language
])
demo
.
launch
(
server_name
=
"0.0.0.0"
)
demo
.
launch
(
server_name
=
"0.0.0.0"
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment