Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
98313d4a
Unverified
Commit
98313d4a
authored
Sep 18, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Sep 18, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev' into content-list-not-drop
parents
16699a9a
734f9c4c
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
65 additions
and
34 deletions
+65
-34
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+35
-27
AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+6
-2
OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+2
-1
TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+2
-1
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+2
-1
user_api.py
magic_pdf/user_api.py
+11
-2
app.py
projects/gradio_app/app.py
+7
-0
academic_paper_formula.pdf
projects/gradio_app/examples/academic_paper_formula.pdf
+0
-0
academic_paper_img_formula.pdf
projects/gradio_app/examples/academic_paper_img_formula.pdf
+0
-0
garbled_formula.pdf
projects/gradio_app/examples/garbled_formula.pdf
+0
-0
garbled_formula2.pdf
projects/gradio_app/examples/garbled_formula2.pdf
+0
-0
garbled_img_formula.pdf
projects/gradio_app/examples/garbled_img_formula.pdf
+0
-0
scanned.pdf
projects/gradio_app/examples/scanned.pdf
+0
-0
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
98313d4a
...
@@ -116,17 +116,20 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
...
@@ -116,17 +116,20 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
def
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
def
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
mode
,
mode
,
img_buket_path
=
''
):
img_buket_path
=
''
,
parse_type
=
"auto"
,
lang
=
None
):
page_markdown
=
[]
page_markdown
=
[]
for
para_block
in
paras_of_layout
:
for
para_block
in
paras_of_layout
:
para_text
=
''
para_text
=
''
para_type
=
para_block
[
'type'
]
para_type
=
para_block
[
'type'
]
if
para_type
==
BlockType
.
Text
:
if
para_type
==
BlockType
.
Text
:
para_text
=
merge_para_with_text
(
para_block
)
para_text
=
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
)
elif
para_type
==
BlockType
.
Title
:
elif
para_type
==
BlockType
.
Title
:
para_text
=
f
'# {merge_para_with_text(para_block)}'
para_text
=
f
'# {merge_para_with_text(para_block
, parse_type=parse_type, lang=lang
)}'
elif
para_type
==
BlockType
.
InterlineEquation
:
elif
para_type
==
BlockType
.
InterlineEquation
:
para_text
=
merge_para_with_text
(
para_block
)
para_text
=
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
)
elif
para_type
==
BlockType
.
Image
:
elif
para_type
==
BlockType
.
Image
:
if
mode
==
'nlp'
:
if
mode
==
'nlp'
:
continue
continue
...
@@ -139,17 +142,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -139,17 +142,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text
+=
f
"
\n
})
\n
"
para_text
+=
f
"
\n
})
\n
"
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
if
mode
==
'nlp'
:
if
mode
==
'nlp'
:
continue
continue
elif
mode
==
'mm'
:
elif
mode
==
'mm'
:
for
block
in
para_block
[
'blocks'
]:
# 1st.拼table_caption
for
block
in
para_block
[
'blocks'
]:
# 1st.拼table_caption
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼table_body
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼table_body
if
block
[
'type'
]
==
BlockType
.
TableBody
:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
...
@@ -164,7 +167,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -164,7 +167,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text
+=
f
"
\n
})
\n
"
para_text
+=
f
"
\n
})
\n
"
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
if
para_text
.
strip
()
==
''
:
if
para_text
.
strip
()
==
''
:
continue
continue
...
@@ -174,7 +177,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -174,7 +177,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
return
page_markdown
return
page_markdown
def
merge_para_with_text
(
para_block
):
def
merge_para_with_text
(
para_block
,
parse_type
=
"auto"
,
lang
=
None
):
def
detect_language
(
text
):
def
detect_language
(
text
):
en_pattern
=
r'[a-zA-Z]+'
en_pattern
=
r'[a-zA-Z]+'
...
@@ -205,11 +208,15 @@ def merge_para_with_text(para_block):
...
@@ -205,11 +208,15 @@ def merge_para_with_text(para_block):
content
=
span
[
'content'
]
content
=
span
[
'content'
]
# language = detect_lang(content)
# language = detect_lang(content)
language
=
detect_language
(
content
)
language
=
detect_language
(
content
)
if
language
==
'en'
:
# 只对英文长词进行分词处理,中文分词会丢失文本
# 判断是否小语种
content
=
ocr_escape_special_markdown_char
(
if
lang
is
not
None
and
lang
!=
'en'
:
split_long_words
(
content
))
else
:
content
=
ocr_escape_special_markdown_char
(
content
)
content
=
ocr_escape_special_markdown_char
(
content
)
else
:
# 非小语种逻辑
if
language
==
'en'
and
parse_type
==
'ocr'
:
# 只对英文长词进行分词处理,中文分词会丢失文本
content
=
ocr_escape_special_markdown_char
(
split_long_words
(
content
))
else
:
content
=
ocr_escape_special_markdown_char
(
content
)
elif
span_type
==
ContentType
.
InlineEquation
:
elif
span_type
==
ContentType
.
InlineEquation
:
content
=
f
" ${span['content']}$ "
content
=
f
" ${span['content']}$ "
elif
span_type
==
ContentType
.
InterlineEquation
:
elif
span_type
==
ContentType
.
InterlineEquation
:
...
@@ -265,24 +272,24 @@ def para_to_standard_format(para, img_buket_path):
...
@@ -265,24 +272,24 @@ def para_to_standard_format(para, img_buket_path):
return
para_content
return
para_content
def
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
,
drop_reason
=
None
):
def
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
,
parse_type
=
"auto"
,
lang
=
None
,
drop_reason
=
None
):
para_type
=
para_block
[
'type'
]
para_type
=
para_block
[
'type'
]
para_content
=
{}
para_content
=
{}
if
para_type
==
BlockType
.
Text
:
if
para_type
==
BlockType
.
Text
:
para_content
=
{
para_content
=
{
'type'
:
'text'
,
'type'
:
'text'
,
'text'
:
merge_para_with_text
(
para_block
),
'text'
:
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
),
}
}
elif
para_type
==
BlockType
.
Title
:
elif
para_type
==
BlockType
.
Title
:
para_content
=
{
para_content
=
{
'type'
:
'text'
,
'type'
:
'text'
,
'text'
:
merge_para_with_text
(
para_block
),
'text'
:
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
),
'text_level'
:
1
,
'text_level'
:
1
,
}
}
elif
para_type
==
BlockType
.
InterlineEquation
:
elif
para_type
==
BlockType
.
InterlineEquation
:
para_content
=
{
para_content
=
{
'type'
:
'equation'
,
'type'
:
'equation'
,
'text'
:
merge_para_with_text
(
para_block
),
'text'
:
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
),
'text_format'
:
'latex'
,
'text_format'
:
'latex'
,
}
}
elif
para_type
==
BlockType
.
Image
:
elif
para_type
==
BlockType
.
Image
:
...
@@ -293,9 +300,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
...
@@ -293,9 +300,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
img_buket_path
,
img_buket_path
,
block
[
'lines'
][
0
][
'spans'
][
0
][
'image_path'
])
block
[
'lines'
][
0
][
'spans'
][
0
][
'image_path'
])
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_content
[
'img_caption'
]
=
merge_para_with_text
(
block
)
para_content
[
'img_caption'
]
=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
para_content
[
'img_footnote'
]
=
merge_para_with_text
(
block
)
para_content
[
'img_footnote'
]
=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
para_content
=
{
'type'
:
'table'
}
para_content
=
{
'type'
:
'table'
}
for
block
in
para_block
[
'blocks'
]:
for
block
in
para_block
[
'blocks'
]:
...
@@ -306,9 +313,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
...
@@ -306,9 +313,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
para_content
[
'table_body'
]
=
f
"
\n\n
{block['lines'][0]['spans'][0]['html']}
\n\n
"
para_content
[
'table_body'
]
=
f
"
\n\n
{block['lines'][0]['spans'][0]['html']}
\n\n
"
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
block
[
"lines"
][
0
][
"spans"
][
0
][
'image_path'
])
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
block
[
"lines"
][
0
][
"spans"
][
0
][
'image_path'
])
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
para_content
[
'table_caption'
]
=
merge_para_with_text
(
block
)
para_content
[
'table_caption'
]
=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_content
[
'table_footnote'
]
=
merge_para_with_text
(
block
)
para_content
[
'table_footnote'
]
=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_content
[
'page_idx'
]
=
page_idx
para_content
[
'page_idx'
]
=
page_idx
...
@@ -397,7 +404,9 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
...
@@ -397,7 +404,9 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
def
union_make
(
pdf_info_dict
:
list
,
def
union_make
(
pdf_info_dict
:
list
,
make_mode
:
str
,
make_mode
:
str
,
drop_mode
:
str
,
drop_mode
:
str
,
img_buket_path
:
str
=
''
):
img_buket_path
:
str
=
''
,
parse_type
:
str
=
"auto"
,
lang
=
None
):
output_content
=
[]
output_content
=
[]
for
page_info
in
pdf_info_dict
:
for
page_info
in
pdf_info_dict
:
drop_reason_flag
=
False
drop_reason_flag
=
False
...
@@ -424,21 +433,20 @@ def union_make(pdf_info_dict: list,
...
@@ -424,21 +433,20 @@ def union_make(pdf_info_dict: list,
continue
continue
if
make_mode
==
MakeMode
.
MM_MD
:
if
make_mode
==
MakeMode
.
MM_MD
:
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
'mm'
,
img_buket_path
)
paras_of_layout
,
'mm'
,
img_buket_path
,
parse_type
=
parse_type
,
lang
=
lang
)
output_content
.
extend
(
page_markdown
)
output_content
.
extend
(
page_markdown
)
elif
make_mode
==
MakeMode
.
NLP_MD
:
elif
make_mode
==
MakeMode
.
NLP_MD
:
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
'nlp'
)
paras_of_layout
,
'nlp'
,
parse_type
=
parse_type
,
lang
=
lang
)
output_content
.
extend
(
page_markdown
)
output_content
.
extend
(
page_markdown
)
elif
make_mode
==
MakeMode
.
STANDARD_FORMAT
:
elif
make_mode
==
MakeMode
.
STANDARD_FORMAT
:
for
para_block
in
paras_of_layout
:
for
para_block
in
paras_of_layout
:
if
drop_reason_flag
:
if
drop_reason_flag
:
para_content
=
para_to_standard_format_v2
(
para_content
=
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
,
drop_reason
)
para_block
,
img_buket_path
,
page_idx
,
parse_type
=
parse_type
,
lang
=
lang
,
drop_reason
=
drop_reason
)
else
:
else
:
para_content
=
para_to_standard_format_v2
(
para_content
=
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
)
para_block
,
img_buket_path
,
page_idx
,
parse_type
=
parse_type
,
lang
=
lang
)
output_content
.
append
(
para_content
)
output_content
.
append
(
para_content
)
if
make_mode
in
[
MakeMode
.
MM_MD
,
MakeMode
.
NLP_MD
]:
if
make_mode
in
[
MakeMode
.
MM_MD
,
MakeMode
.
NLP_MD
]:
return
'
\n\n
'
.
join
(
output_content
)
return
'
\n\n
'
.
join
(
output_content
)
...
...
magic_pdf/pipe/AbsPipe.py
View file @
98313d4a
...
@@ -95,7 +95,9 @@ class AbsPipe(ABC):
...
@@ -95,7 +95,9 @@ class AbsPipe(ABC):
"""
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
content_list
=
union_make
(
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
drop_mode
,
img_buket_path
)
parse_type
=
pdf_mid_data
[
"_parse_type"
]
lang
=
pdf_mid_data
.
get
(
"_lang"
,
None
)
content_list
=
union_make
(
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
drop_mode
,
img_buket_path
,
parse_type
,
lang
)
return
content_list
return
content_list
@
staticmethod
@
staticmethod
...
@@ -105,7 +107,9 @@ class AbsPipe(ABC):
...
@@ -105,7 +107,9 @@ class AbsPipe(ABC):
"""
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
md_content
=
union_make
(
pdf_info_list
,
md_make_mode
,
drop_mode
,
img_buket_path
)
parse_type
=
pdf_mid_data
[
"_parse_type"
]
lang
=
pdf_mid_data
.
get
(
"_lang"
,
None
)
md_content
=
union_make
(
pdf_info_list
,
md_make_mode
,
drop_mode
,
img_buket_path
,
parse_type
,
lang
)
return
md_content
return
md_content
magic_pdf/pipe/OCRPipe.py
View file @
98313d4a
...
@@ -23,7 +23,8 @@ class OCRPipe(AbsPipe):
...
@@ -23,7 +23,8 @@ class OCRPipe(AbsPipe):
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
result
=
super
()
.
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
result
=
super
()
.
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
...
...
magic_pdf/pipe/TXTPipe.py
View file @
98313d4a
...
@@ -24,7 +24,8 @@ class TXTPipe(AbsPipe):
...
@@ -24,7 +24,8 @@ class TXTPipe(AbsPipe):
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
result
=
super
()
.
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
result
=
super
()
.
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
...
...
magic_pdf/pipe/UNIPipe.py
View file @
98313d4a
...
@@ -44,7 +44,8 @@ class UNIPipe(AbsPipe):
...
@@ -44,7 +44,8 @@ class UNIPipe(AbsPipe):
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
is_debug
=
self
.
is_debug
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
result
=
super
()
.
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
result
=
super
()
.
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
...
...
magic_pdf/user_api.py
View file @
98313d4a
...
@@ -26,7 +26,7 @@ PARSE_TYPE_OCR = "ocr"
...
@@ -26,7 +26,7 @@ PARSE_TYPE_OCR = "ocr"
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page_id
=
0
,
end_page_id
=
None
,
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
,
*
args
,
**
kwargs
):
*
args
,
**
kwargs
):
"""
"""
解析文本类pdf
解析文本类pdf
...
@@ -44,11 +44,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
...
@@ -44,11 +44,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
pdf_info_dict
[
"_version_name"
]
=
__version__
pdf_info_dict
[
"_version_name"
]
=
__version__
if
lang
is
not
None
:
pdf_info_dict
[
"_lang"
]
=
lang
return
pdf_info_dict
return
pdf_info_dict
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page_id
=
0
,
end_page_id
=
None
,
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
,
*
args
,
**
kwargs
):
*
args
,
**
kwargs
):
"""
"""
解析ocr类pdf
解析ocr类pdf
...
@@ -66,6 +69,9 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
...
@@ -66,6 +69,9 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
pdf_info_dict
[
"_version_name"
]
=
__version__
pdf_info_dict
[
"_version_name"
]
=
__version__
if
lang
is
not
None
:
pdf_info_dict
[
"_lang"
]
=
lang
return
pdf_info_dict
return
pdf_info_dict
...
@@ -110,4 +116,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
...
@@ -110,4 +116,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
pdf_info_dict
[
"_version_name"
]
=
__version__
pdf_info_dict
[
"_version_name"
]
=
__version__
if
lang
is
not
None
:
pdf_info_dict
[
"_lang"
]
=
lang
return
pdf_info_dict
return
pdf_info_dict
projects/gradio_app/app.py
View file @
98313d4a
...
@@ -150,6 +150,13 @@ if __name__ == "__main__":
...
@@ -150,6 +150,13 @@ if __name__ == "__main__":
change_bu
=
gr
.
Button
(
"Convert"
)
change_bu
=
gr
.
Button
(
"Convert"
)
clear_bu
=
gr
.
ClearButton
([
pdf_show
],
value
=
"Clear"
)
clear_bu
=
gr
.
ClearButton
([
pdf_show
],
value
=
"Clear"
)
pdf_show
=
PDF
(
label
=
"Please upload pdf"
,
interactive
=
True
,
height
=
800
)
pdf_show
=
PDF
(
label
=
"Please upload pdf"
,
interactive
=
True
,
height
=
800
)
with
gr
.
Accordion
(
"Examples:"
):
example_root
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"examples"
)
gr
.
Examples
(
examples
=
[
os
.
path
.
join
(
example_root
,
_
)
for
_
in
os
.
listdir
(
example_root
)
if
_
.
endswith
(
"pdf"
)],
inputs
=
pdf_show
,
)
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
output_file
=
gr
.
File
(
label
=
"convert result"
,
interactive
=
False
)
output_file
=
gr
.
File
(
label
=
"convert result"
,
interactive
=
False
)
...
...
projects/gradio_app/examples/academic_paper_formula.pdf
0 → 100755
View file @
98313d4a
File added
projects/gradio_app/examples/academic_paper_img_formula.pdf
0 → 100755
View file @
98313d4a
File added
projects/gradio_app/examples/garbled_formula.pdf
0 → 100755
View file @
98313d4a
File added
projects/gradio_app/examples/garbled_formula2.pdf
0 → 100755
View file @
98313d4a
File added
projects/gradio_app/examples/garbled_img_formula.pdf
0 → 100755
View file @
98313d4a
File added
projects/gradio_app/examples/scanned.pdf
0 → 100755
View file @
98313d4a
File added
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment