Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f9df92aa
Commit
f9df92aa
authored
Sep 12, 2024
by
quyuan
Browse files
Options
Browse Files
Download
Plain Diff
fix: add magic-pdf-dev case
parents
8df8737e
0140d7d2
Changes
21
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
385 additions
and
1 deletion
+385
-1
bug_report.yml
.github/ISSUE_TEMPLATE/bug_report.yml
+1
-0
README.md
README.md
+9
-1
README_zh-CN.md
README_zh-CN.md
+12
-0
app.py
app.py
+167
-0
version.py
magic_pdf/libs/version.py
+4
-0
doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+4
-0
pdf_extract_kit.py
magic_pdf/model/pdf_extract_kit.py
+13
-0
AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+7
-0
OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+9
-0
TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+9
-0
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+17
-0
model_configs.yaml
magic_pdf/resources/model_config/model_configs.yaml
+4
-0
cli.py
magic_pdf/tools/cli.py
+10
-0
common.py
magic_pdf/tools/common.py
+13
-0
user_api.py
magic_pdf/user_api.py
+10
-0
README.md
projects/README.md
+3
-0
README_zh-CN.md
projects/README_zh-CN.md
+3
-0
README_zh-CN.md
projects/llama_index_rag/README_zh-CN.md
+18
-0
test_bench.py
tests/test_cli/test_bench.py
+54
-0
table.jpg
tests/test_table/assets/table.jpg
+0
-0
test_tablemaster.py
tests/test_table/test_tablemaster.py
+18
-0
No files found.
.github/ISSUE_TEMPLATE/bug_report.yml
View file @
f9df92aa
...
@@ -80,6 +80,7 @@ body:
...
@@ -80,6 +80,7 @@ body:
-
-
-
"
0.6.x"
-
"
0.6.x"
-
"
0.7.x"
-
"
0.7.x"
-
"
0.8.x"
validations
:
validations
:
required
:
true
required
:
true
...
...
README.md
View file @
f9df92aa
This diff is collapsed.
Click to expand it.
README_zh-CN.md
View file @
f9df92aa
This diff is collapsed.
Click to expand it.
app.py
0 → 100644
View file @
f9df92aa
# Copyright (c) Opendatalab. All rights reserved.
import
base64
import
os
import
time
import
zipfile
from
pathlib
import
Path
import
re
from
loguru
import
logger
from
magic_pdf.libs.hash_utils
import
compute_sha256
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.tools.common
import
do_parse
,
prepare_env
os
.
system
(
"pip install gradio"
)
os
.
system
(
"pip install gradio-pdf"
)
import
gradio
as
gr
from
gradio_pdf
import
PDF
def
read_fn
(
path
):
disk_rw
=
DiskReaderWriter
(
os
.
path
.
dirname
(
path
))
return
disk_rw
.
read
(
os
.
path
.
basename
(
path
),
AbsReaderWriter
.
MODE_BIN
)
def
parse_pdf
(
doc_path
,
output_dir
,
end_page_id
):
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
try
:
file_name
=
f
"{str(Path(doc_path).stem)}_{time.time()}"
pdf_data
=
read_fn
(
doc_path
)
parse_method
=
"auto"
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
file_name
,
parse_method
)
do_parse
(
output_dir
,
file_name
,
pdf_data
,
[],
parse_method
,
False
,
end_page_id
=
end_page_id
,
)
return
local_md_dir
,
file_name
except
Exception
as
e
:
logger
.
exception
(
e
)
def
compress_directory_to_zip
(
directory_path
,
output_zip_path
):
"""
压缩指定目录到一个 ZIP 文件。
:param directory_path: 要压缩的目录路径
:param output_zip_path: 输出的 ZIP 文件路径
"""
try
:
with
zipfile
.
ZipFile
(
output_zip_path
,
'w'
,
zipfile
.
ZIP_DEFLATED
)
as
zipf
:
# 遍历目录中的所有文件和子目录
for
root
,
dirs
,
files
in
os
.
walk
(
directory_path
):
for
file
in
files
:
# 构建完整的文件路径
file_path
=
os
.
path
.
join
(
root
,
file
)
# 计算相对路径
arcname
=
os
.
path
.
relpath
(
file_path
,
directory_path
)
# 添加文件到 ZIP 文件
zipf
.
write
(
file_path
,
arcname
)
return
0
except
Exception
as
e
:
logger
.
exception
(
e
)
return
-
1
def
image_to_base64
(
image_path
):
with
open
(
image_path
,
"rb"
)
as
image_file
:
return
base64
.
b64encode
(
image_file
.
read
())
.
decode
(
'utf-8'
)
def
replace_image_with_base64
(
markdown_text
,
image_dir_path
):
# 匹配Markdown中的图片标签
pattern
=
r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
# 替换图片链接
def
replace
(
match
):
relative_path
=
match
.
group
(
1
)
full_path
=
os
.
path
.
join
(
image_dir_path
,
relative_path
)
base64_image
=
image_to_base64
(
full_path
)
return
f
""
# 应用替换
return
re
.
sub
(
pattern
,
replace
,
markdown_text
)
def
to_markdown
(
file_path
,
end_pages
):
# 获取识别的md文件以及压缩包文件路径
local_md_dir
,
file_name
=
parse_pdf
(
file_path
,
'./output'
,
end_pages
-
1
)
archive_zip_path
=
os
.
path
.
join
(
"./output"
,
compute_sha256
(
local_md_dir
)
+
".zip"
)
zip_archive_success
=
compress_directory_to_zip
(
local_md_dir
,
archive_zip_path
)
if
zip_archive_success
==
0
:
logger
.
info
(
"压缩成功"
)
else
:
logger
.
error
(
"压缩失败"
)
md_path
=
os
.
path
.
join
(
local_md_dir
,
file_name
+
".md"
)
with
open
(
md_path
,
'r'
,
encoding
=
'utf-8'
)
as
f
:
txt_content
=
f
.
read
()
md_content
=
replace_image_with_base64
(
txt_content
,
local_md_dir
)
# 返回转换后的PDF路径
new_pdf_path
=
os
.
path
.
join
(
local_md_dir
,
file_name
+
"_layout.pdf"
)
return
md_content
,
txt_content
,
archive_zip_path
,
new_pdf_path
# def show_pdf(file_path):
# with open(file_path, "rb") as f:
# base64_pdf = base64.b64encode(f.read()).decode('utf-8')
# pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" ' \
# f'width="100%" height="1000" type="application/pdf">'
# return pdf_display
latex_delimiters
=
[{
"left"
:
"$$"
,
"right"
:
"$$"
,
"display"
:
True
},
{
"left"
:
'$'
,
"right"
:
'$'
,
"display"
:
False
}]
def
init_model
():
from
magic_pdf.model.doc_analyze_by_custom_model
import
ModelSingleton
try
:
model_manager
=
ModelSingleton
()
txt_model
=
model_manager
.
get_model
(
False
,
False
)
logger
.
info
(
f
"txt_model init final"
)
ocr_model
=
model_manager
.
get_model
(
True
,
False
)
logger
.
info
(
f
"ocr_model init final"
)
return
0
except
Exception
as
e
:
logger
.
exception
(
e
)
return
-
1
model_init
=
init_model
()
logger
.
info
(
f
"model_init: {model_init}"
)
if
__name__
==
"__main__"
:
with
gr
.
Blocks
()
as
demo
:
with
gr
.
Row
():
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
pdf_show
=
gr
.
Markdown
()
max_pages
=
gr
.
Slider
(
1
,
10
,
5
,
step
=
1
,
label
=
"Max convert pages"
)
with
gr
.
Row
()
as
bu_flow
:
change_bu
=
gr
.
Button
(
"Convert"
)
clear_bu
=
gr
.
ClearButton
([
pdf_show
],
value
=
"Clear"
)
pdf_show
=
PDF
(
label
=
"Please upload pdf"
,
interactive
=
True
,
height
=
800
)
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
output_file
=
gr
.
File
(
label
=
"convert result"
,
interactive
=
False
)
with
gr
.
Tabs
():
with
gr
.
Tab
(
"Markdown rendering"
):
md
=
gr
.
Markdown
(
label
=
"Markdown rendering"
,
height
=
900
,
show_copy_button
=
True
,
latex_delimiters
=
latex_delimiters
,
line_breaks
=
True
)
with
gr
.
Tab
(
"Markdown text"
):
md_text
=
gr
.
TextArea
(
lines
=
45
,
show_copy_button
=
True
)
change_bu
.
click
(
fn
=
to_markdown
,
inputs
=
[
pdf_show
,
max_pages
],
outputs
=
[
md
,
md_text
,
output_file
,
pdf_show
])
clear_bu
.
add
([
md
,
pdf_show
,
md_text
,
output_file
])
demo
.
launch
()
magic_pdf/libs/version.py
View file @
f9df92aa
<<<<<<<
HEAD
__version__
=
"0.7.1"
__version__
=
"0.7.1"
=======
__version__
=
"0.8.0"
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
magic_pdf/model/doc_analyze_by_custom_model.py
View file @
f9df92aa
...
@@ -106,7 +106,11 @@ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None):
...
@@ -106,7 +106,11 @@ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None):
def
doc_analyze
(
pdf_bytes
:
bytes
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
,
def
doc_analyze
(
pdf_bytes
:
bytes
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
=======
start_page_id
=
0
,
end_page_id
=
None
):
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
model_manager
=
ModelSingleton
()
model_manager
=
ModelSingleton
()
custom_model
=
model_manager
.
get_model
(
ocr
,
show_log
,
lang
)
custom_model
=
model_manager
.
get_model
(
ocr
,
show_log
,
lang
)
...
...
magic_pdf/model/pdf_extract_kit.py
View file @
f9df92aa
...
@@ -74,11 +74,16 @@ def layout_model_init(weight, config_file, device):
...
@@ -74,11 +74,16 @@ def layout_model_init(weight, config_file, device):
return
model
return
model
<<<<<<<
HEAD
def
ocr_model_init
(
show_log
:
bool
=
False
,
det_db_box_thresh
=
0.3
,
lang
=
None
):
def
ocr_model_init
(
show_log
:
bool
=
False
,
det_db_box_thresh
=
0.3
,
lang
=
None
):
if
lang
is
not
None
:
if
lang
is
not
None
:
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
,
lang
=
lang
)
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
,
lang
=
lang
)
else
:
else
:
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
)
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
)
=======
def
ocr_model_init
(
show_log
:
bool
=
False
,
det_db_box_thresh
=
0.3
):
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
return
model
return
model
...
@@ -137,8 +142,12 @@ def atom_model_init(model_name: str, **kwargs):
...
@@ -137,8 +142,12 @@ def atom_model_init(model_name: str, **kwargs):
elif
model_name
==
AtomicModel
.
OCR
:
elif
model_name
==
AtomicModel
.
OCR
:
atom_model
=
ocr_model_init
(
atom_model
=
ocr_model_init
(
kwargs
.
get
(
"ocr_show_log"
),
kwargs
.
get
(
"ocr_show_log"
),
<<<<<<<
HEAD
kwargs
.
get
(
"det_db_box_thresh"
),
kwargs
.
get
(
"det_db_box_thresh"
),
kwargs
.
get
(
"lang"
)
kwargs
.
get
(
"lang"
)
=======
kwargs
.
get
(
"det_db_box_thresh"
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
)
)
elif
model_name
==
AtomicModel
.
Table
:
elif
model_name
==
AtomicModel
.
Table
:
atom_model
=
table_model_init
(
atom_model
=
table_model_init
(
...
@@ -235,8 +244,12 @@ class CustomPEKModel:
...
@@ -235,8 +244,12 @@ class CustomPEKModel:
self
.
ocr_model
=
atom_model_manager
.
get_atom_model
(
self
.
ocr_model
=
atom_model_manager
.
get_atom_model
(
atom_model_name
=
AtomicModel
.
OCR
,
atom_model_name
=
AtomicModel
.
OCR
,
ocr_show_log
=
show_log
,
ocr_show_log
=
show_log
,
<<<<<<<
HEAD
det_db_box_thresh
=
0.3
,
det_db_box_thresh
=
0.3
,
lang
=
self
.
lang
lang
=
self
.
lang
=======
det_db_box_thresh
=
0.3
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
)
)
# init table model
# init table model
if
self
.
apply_table
:
if
self
.
apply_table
:
...
...
magic_pdf/pipe/AbsPipe.py
View file @
f9df92aa
...
@@ -17,7 +17,11 @@ class AbsPipe(ABC):
...
@@ -17,7 +17,11 @@ class AbsPipe(ABC):
PIP_TXT
=
"txt"
PIP_TXT
=
"txt"
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
=======
start_page_id
=
0
,
end_page_id
=
None
):
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
self
.
pdf_bytes
=
pdf_bytes
self
.
pdf_bytes
=
pdf_bytes
self
.
model_list
=
model_list
self
.
model_list
=
model_list
self
.
image_writer
=
image_writer
self
.
image_writer
=
image_writer
...
@@ -25,7 +29,10 @@ class AbsPipe(ABC):
...
@@ -25,7 +29,10 @@ class AbsPipe(ABC):
self
.
is_debug
=
is_debug
self
.
is_debug
=
is_debug
self
.
start_page_id
=
start_page_id
self
.
start_page_id
=
start_page_id
self
.
end_page_id
=
end_page_id
self
.
end_page_id
=
end_page_id
<<<<<<<
HEAD
self
.
lang
=
lang
self
.
lang
=
lang
=======
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
get_compress_pdf_mid_data
(
self
):
def
get_compress_pdf_mid_data
(
self
):
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
...
...
magic_pdf/pipe/OCRPipe.py
View file @
f9df92aa
...
@@ -10,16 +10,25 @@ from magic_pdf.user_api import parse_ocr_pdf
...
@@ -10,16 +10,25 @@ from magic_pdf.user_api import parse_ocr_pdf
class
OCRPipe
(
AbsPipe
):
class
OCRPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
=======
start_page_id
=
0
,
end_page_id
=
None
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
pass
pass
def
pipe_analyze
(
self
):
def
pipe_analyze
(
self
):
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
<<<<<<<
HEAD
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
lang
=
self
.
lang
)
=======
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
...
...
magic_pdf/pipe/TXTPipe.py
View file @
f9df92aa
...
@@ -11,16 +11,25 @@ from magic_pdf.user_api import parse_txt_pdf
...
@@ -11,16 +11,25 @@ from magic_pdf.user_api import parse_txt_pdf
class
TXTPipe
(
AbsPipe
):
class
TXTPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
=======
start_page_id
=
0
,
end_page_id
=
None
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
pass
pass
def
pipe_analyze
(
self
):
def
pipe_analyze
(
self
):
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
<<<<<<<
HEAD
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
lang
=
self
.
lang
)
=======
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
...
...
magic_pdf/pipe/UNIPipe.py
View file @
f9df92aa
...
@@ -14,9 +14,15 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
...
@@ -14,9 +14,15 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class
UNIPipe
(
AbsPipe
):
class
UNIPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
jso_useful_key
:
dict
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
def
__init__
(
self
,
pdf_bytes
:
bytes
,
jso_useful_key
:
dict
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
self
.
pdf_type
=
jso_useful_key
[
"_pdf_type"
]
self
.
pdf_type
=
jso_useful_key
[
"_pdf_type"
]
super
()
.
__init__
(
pdf_bytes
,
jso_useful_key
[
"model_list"
],
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
super
()
.
__init__
(
pdf_bytes
,
jso_useful_key
[
"model_list"
],
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
=======
start_page_id
=
0
,
end_page_id
=
None
):
self
.
pdf_type
=
jso_useful_key
[
"_pdf_type"
]
super
()
.
__init__
(
pdf_bytes
,
jso_useful_key
[
"model_list"
],
image_writer
,
is_debug
,
start_page_id
,
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
if
len
(
self
.
model_list
)
==
0
:
if
len
(
self
.
model_list
)
==
0
:
self
.
input_model_is_empty
=
True
self
.
input_model_is_empty
=
True
else
:
else
:
...
@@ -28,19 +34,30 @@ class UNIPipe(AbsPipe):
...
@@ -28,19 +34,30 @@ class UNIPipe(AbsPipe):
def
pipe_analyze
(
self
):
def
pipe_analyze
(
self
):
if
self
.
pdf_type
==
self
.
PIP_TXT
:
if
self
.
pdf_type
==
self
.
PIP_TXT
:
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
<<<<<<<
HEAD
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
lang
=
self
.
lang
)
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
lang
=
self
.
lang
)
=======
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
if
self
.
pdf_type
==
self
.
PIP_TXT
:
if
self
.
pdf_type
==
self
.
PIP_TXT
:
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
input_model_is_empty
=
self
.
input_model_is_empty
,
is_debug
=
self
.
is_debug
,
input_model_is_empty
=
self
.
input_model_is_empty
,
<<<<<<<
HEAD
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
lang
=
self
.
lang
)
=======
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
is_debug
=
self
.
is_debug
,
...
...
magic_pdf/resources/model_config/model_configs.yaml
View file @
f9df92aa
...
@@ -10,6 +10,10 @@ config:
...
@@ -10,6 +10,10 @@ config:
weights
:
weights
:
layout
:
Layout/model_final.pth
layout
:
Layout/model_final.pth
mfd
:
MFD/weights.pt
mfd
:
MFD/weights.pt
<<<<<<< HEAD
mfr
:
MFR/unimernet_base
mfr
:
MFR/unimernet_base
=======
mfr
:
MFR/UniMERNet
>
>>>>>>
0140d7d271ac3b1561ca2272030e9e038b469999
struct_eqtable: TabRec/StructEqTable
struct_eqtable: TabRec/StructEqTable
TableMaster: TabRec/TableMaster
TableMaster: TabRec/TableMaster
\ No newline at end of file
magic_pdf/tools/cli.py
View file @
f9df92aa
...
@@ -45,6 +45,7 @@ without method specified, auto will be used by default.""",
...
@@ -45,6 +45,7 @@ without method specified, auto will be used by default.""",
default
=
'auto'
,
default
=
'auto'
,
)
)
@
click
.
option
(
@
click
.
option
(
<<<<<<<
HEAD
'-l'
,
'-l'
,
'--lang'
,
'--lang'
,
'lang'
,
'lang'
,
...
@@ -57,6 +58,8 @@ without method specified, auto will be used by default.""",
...
@@ -57,6 +58,8 @@ without method specified, auto will be used by default.""",
default
=
None
,
default
=
None
,
)
)
@
click
.
option
(
@
click
.
option
(
=======
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
'-d'
,
'-d'
,
'--debug'
,
'--debug'
,
'debug_able'
,
'debug_able'
,
...
@@ -80,7 +83,11 @@ without method specified, auto will be used by default.""",
...
@@ -80,7 +83,11 @@ without method specified, auto will be used by default.""",
help
=
'The ending page for PDF parsing, beginning from 0.'
,
help
=
'The ending page for PDF parsing, beginning from 0.'
,
default
=
None
,
default
=
None
,
)
)
<<<<<<<
HEAD
def
cli
(
path
,
output_dir
,
method
,
lang
,
debug_able
,
start_page_id
,
end_page_id
):
def
cli
(
path
,
output_dir
,
method
,
lang
,
debug_able
,
start_page_id
,
end_page_id
):
=======
def
cli
(
path
,
output_dir
,
method
,
debug_able
,
start_page_id
,
end_page_id
):
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
model_config
.
__use_inside_model__
=
True
model_config
.
__use_inside_model__
=
True
model_config
.
__model_mode__
=
'full'
model_config
.
__model_mode__
=
'full'
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
...
@@ -102,7 +109,10 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
...
@@ -102,7 +109,10 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
debug_able
,
debug_able
,
start_page_id
=
start_page_id
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
end_page_id
=
end_page_id
,
<<<<<<<
HEAD
lang
=
lang
lang
=
lang
=======
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
)
)
except
Exception
as
e
:
except
Exception
as
e
:
...
...
magic_pdf/tools/common.py
View file @
f9df92aa
...
@@ -44,7 +44,10 @@ def do_parse(
...
@@ -44,7 +44,10 @@ def do_parse(
f_draw_model_bbox
=
False
,
f_draw_model_bbox
=
False
,
start_page_id
=
0
,
start_page_id
=
0
,
end_page_id
=
None
,
end_page_id
=
None
,
<<<<<<<
HEAD
lang
=
None
,
lang
=
None
,
=======
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
):
):
if
debug_able
:
if
debug_able
:
logger
.
warning
(
"debug mode is on"
)
logger
.
warning
(
"debug mode is on"
)
...
@@ -62,6 +65,7 @@ def do_parse(
...
@@ -62,6 +65,7 @@ def do_parse(
if
parse_method
==
'auto'
:
if
parse_method
==
'auto'
:
jso_useful_key
=
{
'_pdf_type'
:
''
,
'model_list'
:
model_list
}
jso_useful_key
=
{
'_pdf_type'
:
''
,
'model_list'
:
model_list
}
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
,
is_debug
=
True
,
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
,
is_debug
=
True
,
<<<<<<<
HEAD
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
elif
parse_method
==
'txt'
:
elif
parse_method
==
'txt'
:
pipe
=
TXTPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
pipe
=
TXTPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
...
@@ -69,6 +73,15 @@ def do_parse(
...
@@ -69,6 +73,15 @@ def do_parse(
elif
parse_method
==
'ocr'
:
elif
parse_method
==
'ocr'
:
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
=======
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
elif
parse_method
==
'txt'
:
pipe
=
TXTPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
elif
parse_method
==
'ocr'
:
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
else
:
else
:
logger
.
error
(
'unknown parse method'
)
logger
.
error
(
'unknown parse method'
)
exit
(
1
)
exit
(
1
)
...
...
magic_pdf/user_api.py
View file @
f9df92aa
...
@@ -71,7 +71,11 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
...
@@ -71,7 +71,11 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
input_model_is_empty
:
bool
=
False
,
input_model_is_empty
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
,
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
,
=======
start_page_id
=
0
,
end_page_id
=
None
,
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
*
args
,
**
kwargs
):
*
args
,
**
kwargs
):
"""
"""
ocr和文本混合的pdf,全部解析出来
ocr和文本混合的pdf,全部解析出来
...
@@ -95,11 +99,17 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
...
@@ -95,11 +99,17 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
):
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
if
input_model_is_empty
:
if
input_model_is_empty
:
<<<<<<<
HEAD
pdf_models
=
doc_analyze
(
pdf_bytes
,
pdf_models
=
doc_analyze
(
pdf_bytes
,
ocr
=
True
,
ocr
=
True
,
start_page_id
=
start_page_id
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
lang
=
lang
)
=======
pdf_models
=
doc_analyze
(
pdf_bytes
,
ocr
=
True
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
if
pdf_info_dict
is
None
:
if
pdf_info_dict
is
None
:
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
...
...
projects/README.md
View file @
f9df92aa
...
@@ -3,6 +3,9 @@
...
@@ -3,6 +3,9 @@
## Project List
## Project List
-
[
llama_index_rag
](
./llama_index_rag/README.md
)
: Build a lightweight RAG system based on llama_index
-
[
llama_index_rag
](
./llama_index_rag/README.md
)
: Build a lightweight RAG system based on llama_index
<<<<<<< HEAD
-
[
gradio_app
](
./gradio_app/README.md
)
: Build a web app based on gradio
-
[
gradio_app
](
./gradio_app/README.md
)
: Build a web app based on gradio
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
projects/README_zh-CN.md
View file @
f9df92aa
...
@@ -3,5 +3,8 @@
...
@@ -3,5 +3,8 @@
## 项目列表
## 项目列表
-
[
llama_index_rag
](
./llama_index_rag/README_zh-CN.md
)
: 基于 llama_index 构建轻量级 RAG 系统
-
[
llama_index_rag
](
./llama_index_rag/README_zh-CN.md
)
: 基于 llama_index 构建轻量级 RAG 系统
<<<<<<< HEAD
-
[
gradio_app
](
./gradio_app/README_zh-CN.md
)
: 基于 Gradio 的 Web 应用
-
[
gradio_app
](
./gradio_app/README_zh-CN.md
)
: 基于 Gradio 的 Web 应用
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
projects/llama_index_rag/README_zh-CN.md
View file @
f9df92aa
...
@@ -59,7 +59,10 @@ Server: Docker Engine - Community
...
@@ -59,7 +59,10 @@ Server: Docker Engine - Community
```
bash
```
bash
# install
# install
pip
install
modelscope
==
1.14.0
pip
install
modelscope
==
1.14.0
<<<<<<
< HEAD
=======
>>>>>>>
0140d7d271ac3b1561ca2272030e9e038b469999
pip
install
llama-index-vector-stores-elasticsearch
==
0.2.0
pip
install
llama-index-vector-stores-elasticsearch
==
0.2.0
pip
install
llama-index-embeddings-dashscope
==
0.2.0
pip
install
llama-index-embeddings-dashscope
==
0.2.0
pip
install
llama-index-core
==
0.10.68
pip
install
llama-index-core
==
0.10.68
...
@@ -71,13 +74,19 @@ pip install accelerate==0.33.0
...
@@ -71,13 +74,19 @@ pip install accelerate==0.33.0
pip uninstall transformer-engine
pip uninstall transformer-engine
```
```
<<<<<<< HEAD
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
## 示例
## 示例
```
`bash
```
`bash
cd projects/llama_index_rag
cd projects/llama_index_rag
<<<<<<< HEAD
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
docker compose up -d
docker compose up -d
or
or
...
@@ -85,14 +94,20 @@ or
...
@@ -85,14 +94,20 @@ or
docker-compose up -d
docker-compose up -d
<<<<<<< HEAD
# 配置环境变量
# 配置环境变量
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
export ES_USER=elastic
export ES_USER=elastic
export ES_PASSWORD=llama_index
export ES_PASSWORD=llama_index
export ES_URL=http://127.0.0.1:9200
export ES_URL=http://127.0.0.1:9200
export DASHSCOPE_API_KEY={some_key}
export DASHSCOPE_API_KEY={some_key}
<<<<<<< HEAD
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
DASHSCOPE_API_KEY 开通参考[文档](https://help.aliyun.com/zh/dashscope/opening-service)
DASHSCOPE_API_KEY 开通参考[文档](https://help.aliyun.com/zh/dashscope/opening-service)
# 未导入数据,查询问题。返回通义千问默认答案
# 未导入数据,查询问题。返回通义千问默认答案
...
@@ -120,7 +135,10 @@ python data_ingestion.py -p example/data/declaration_of_the_rights_of_man_1789.p
...
@@ -120,7 +135,10 @@ python data_ingestion.py -p example/data/declaration_of_the_rights_of_man_1789.p
# 导入数据后,查询问题。通义千问模型会根据 RAG 系统的检索结果,结合上下文,给出答案。
# 导入数据后,查询问题。通义千问模型会根据 RAG 系统的检索结果,结合上下文,给出答案。
<<<<<<< HEAD
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
python query.py -q 'how about the rights of men'
python query.py -q 'how about the rights of men'
## outputs
## outputs
...
...
tests/test_cli/test_bench.py
0 → 100644
View file @
f9df92aa
"""
bench
"""
import
os
import
shutil
import
json
from
lib
import
calculate_score
import
pytest
from
conf
import
conf
code_path
=
os
.
environ
.
get
(
'GITHUB_WORKSPACE'
)
pdf_dev_path
=
conf
.
conf
[
"pdf_dev_path"
]
pdf_res_path
=
conf
.
conf
[
"pdf_res_path"
]
class
TestBench
():
"""
test bench
"""
def
test_ci_ben
(
self
):
"""
ci benchmark
"""
fr
=
open
(
os
.
path
.
join
(
pdf_dev_path
,
"result.json"
),
"r"
,
encoding
=
"utf-8"
)
lines
=
fr
.
readlines
()
last_line
=
lines
[
-
1
]
.
strip
()
last_score
=
json
.
loads
(
last_line
)
last_simscore
=
last_score
[
"average_sim_score"
]
last_editdistance
=
last_score
[
"average_edit_distance"
]
last_bleu
=
last_score
[
"average_bleu_score"
]
os
.
system
(
f
"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}"
)
now_score
=
get_score
()
print
(
"now_score:"
,
now_score
)
if
not
os
.
path
.
exists
(
os
.
path
.
join
(
pdf_dev_path
,
"ci"
)):
os
.
makedirs
(
os
.
path
.
join
(
pdf_dev_path
,
"ci"
),
exist_ok
=
True
)
fw
=
open
(
os
.
path
.
join
(
pdf_dev_path
,
"ci"
,
"result.json"
),
"w+"
,
encoding
=
"utf-8"
)
fw
.
write
(
json
.
dumps
(
now_score
)
+
"
\n
"
)
now_simscore
=
now_score
[
"average_sim_score"
]
now_editdistance
=
now_score
[
"average_edit_distance"
]
now_bleu
=
now_score
[
"average_bleu_score"
]
assert
last_simscore
<=
now_simscore
assert
last_editdistance
<=
now_editdistance
assert
last_bleu
<=
now_bleu
def
get_score
():
"""
get score
"""
score
=
calculate_score
.
Scoring
(
os
.
path
.
join
(
pdf_dev_path
,
"result.json"
))
score
.
calculate_similarity_total
(
"mineru"
,
pdf_dev_path
)
res
=
score
.
summary_scores
()
return
res
tests/test_table/assets/table.jpg
0 → 100644
View file @
f9df92aa
58 KB
tests/test_table/test_tablemaster.py
0 → 100644
View file @
f9df92aa
import
unittest
from
PIL
import
Image
from
magic_pdf.model.ppTableModel
import
ppTableModel
class
TestppTableModel
(
unittest
.
TestCase
):
def
test_image2html
(
self
):
img
=
Image
.
open
(
"tests/test_table/assets/table.jpg"
)
# 修改table模型路径
config
=
{
"device"
:
"cuda"
,
"model_dir"
:
"D:/models/PDF-Extract-Kit/models/TabRec/TableMaster"
}
table_model
=
ppTableModel
(
config
)
res
=
table_model
.
img2html
(
img
)
true_value
=
"""<td><table border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 </td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>
\n
"""
self
.
assertEqual
(
true_value
,
res
)
if
__name__
==
"__main__"
:
unittest
.
main
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment