Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
e36627be
Unverified
Commit
e36627be
authored
Oct 23, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 23, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #777 from myhloli/add-doclayout-yolo
feat: add support for non-PDF file conversion to PDF
parents
d1c0546a
4834baf4
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
28 additions
and
5 deletions
+28
-5
app.py
projects/gradio_app/app.py
+28
-5
No files found.
projects/gradio_app/app.py
View file @
e36627be
...
...
@@ -3,10 +3,12 @@
import
base64
import
os
import
time
import
uuid
import
zipfile
from
pathlib
import
Path
import
re
import
pymupdf
from
loguru
import
logger
from
magic_pdf.libs.hash_utils
import
compute_sha256
...
...
@@ -164,12 +166,32 @@ all_lang = [""]
all_lang
.
extend
([
*
other_lang
,
*
latin_lang
,
*
arabic_lang
,
*
cyrillic_lang
,
*
devanagari_lang
])
def to_pdf(file_path):
    """Return the path of a PDF version of the document at ``file_path``.

    The file is opened with ``pymupdf``. When the opened document reports
    ``is_pdf``, ``file_path`` itself is returned unchanged. Otherwise the
    document is converted via ``convert_to_pdf()`` and the resulting bytes
    are written to a new ``<uuid4>.pdf`` file located in the same directory
    as ``file_path``; the path of that new file is returned.
    """
    with pymupdf.open(file_path) as document:
        # Already a PDF: nothing to convert, hand the original path back.
        if document.is_pdf:
            return file_path

        converted = document.convert_to_pdf()

        # Store the converted bytes beside the source file under a
        # freshly generated UUID-based file name.
        target_dir = os.path.dirname(file_path)
        pdf_name = str(uuid.uuid4()) + ".pdf"
        pdf_path = os.path.join(target_dir, pdf_name)

        with open(pdf_path, 'wb') as sink:
            sink.write(converted)

        return pdf_path
if
__name__
==
"__main__"
:
with
gr
.
Blocks
()
as
demo
:
gr
.
HTML
(
header
)
with
gr
.
Row
():
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
pdf_show
=
gr
.
Markdown
(
)
file
=
gr
.
File
(
label
=
"Please upload a PDF or image"
,
file_types
=
[
".pdf"
,
".png"
,
".jpeg"
,
"jpg"
]
)
max_pages
=
gr
.
Slider
(
1
,
10
,
5
,
step
=
1
,
label
=
"Max convert pages"
)
with
gr
.
Row
():
layout_mode
=
gr
.
Dropdown
([
"layoutlmv3"
,
"doclayout_yolo"
],
label
=
"Layout model"
,
value
=
"layoutlmv3"
)
...
...
@@ -180,14 +202,14 @@ if __name__ == "__main__":
table_enable
=
gr
.
Checkbox
(
label
=
"Enable table recognition(test)"
,
value
=
False
)
with
gr
.
Row
():
change_bu
=
gr
.
Button
(
"Convert"
)
clear_bu
=
gr
.
ClearButton
(
[
pdf_show
],
value
=
"Clear"
)
pdf_show
=
PDF
(
label
=
"P
lease upload pdf
"
,
interactive
=
True
,
height
=
800
)
clear_bu
=
gr
.
ClearButton
(
value
=
"Clear"
)
pdf_show
=
PDF
(
label
=
"P
DF preview
"
,
interactive
=
True
,
height
=
800
)
with
gr
.
Accordion
(
"Examples:"
):
example_root
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"examples"
)
gr
.
Examples
(
examples
=
[
os
.
path
.
join
(
example_root
,
_
)
for
_
in
os
.
listdir
(
example_root
)
if
_
.
endswith
(
"pdf"
)],
inputs
=
pdf_show
,
inputs
=
pdf_show
)
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
...
...
@@ -198,8 +220,9 @@ if __name__ == "__main__":
latex_delimiters
=
latex_delimiters
,
line_breaks
=
True
)
with
gr
.
Tab
(
"Markdown text"
):
md_text
=
gr
.
TextArea
(
lines
=
45
,
show_copy_button
=
True
)
file
.
upload
(
fn
=
to_pdf
,
inputs
=
file
,
outputs
=
pdf_show
)
change_bu
.
click
(
fn
=
to_markdown
,
inputs
=
[
pdf_show
,
max_pages
,
is_ocr
,
layout_mode
,
formula_enable
,
table_enable
,
language
],
outputs
=
[
md
,
md_text
,
output_file
,
pdf_show
])
clear_bu
.
add
([
md
,
pdf_show
,
md_text
,
output_file
,
is_ocr
])
clear_bu
.
add
([
file
,
md
,
pdf_show
,
md_text
,
output_file
,
is_ocr
,
table_enable
,
language
])
demo
.
launch
(
server_name
=
"0.0.0.0"
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment