Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
20937b09
Unverified
Commit
20937b09
authored
Sep 12, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Sep 12, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #607 from myhloli/dev
fix: solve conflicts
parents
31a1ecdb
a4c72e2e
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
0 additions
and
323 deletions
+0
-323
LICENSE.md
LICENSE.md
+0
-3
README.md
README.md
+0
-5
README_zh-CN.md
README_zh-CN.md
+0
-9
app.py
app.py
+0
-167
version.py
magic_pdf/libs/version.py
+0
-4
doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+0
-4
pdf_extract_kit.py
magic_pdf/model/pdf_extract_kit.py
+0
-13
AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+0
-7
OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+0
-9
TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+0
-9
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+0
-17
model_configs.yaml
magic_pdf/resources/model_config/model_configs.yaml
+0
-4
cli.py
magic_pdf/tools/cli.py
+0
-10
user_api.py
magic_pdf/user_api.py
+0
-10
mv_pdf.py
mv_pdf.py
+0
-26
README.md
projects/README.md
+0
-3
README_zh-CN.md
projects/README_zh-CN.md
+0
-3
README_zh-CN.md
projects/llama_index_rag/README_zh-CN.md
+0
-20
No files found.
LICENSE.md
View file @
20937b09
...
...
@@ -660,6 +660,3 @@ if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>
.
$^1$
\ No newline at end of file
README.md
View file @
20937b09
...
...
@@ -14,14 +14,9 @@
[

](https://pepy.tech/project/magic-pdf)
[

](https://pepy.tech/project/magic-pdf)
<<<<<<< HEAD
[

](https://huggingface.co/spaces/opendatalab/MinerU)
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
=======
[

](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
[

](https://huggingface.co/spaces/opendatalab/MinerU)
[

](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
[

](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
[

](#)
...
...
README_zh-CN.md
View file @
20937b09
...
...
@@ -14,14 +14,9 @@
[

](https://pepy.tech/project/magic-pdf)
[

](https://pepy.tech/project/magic-pdf)
<<<<<<< HEAD
[

](https://huggingface.co/spaces/opendatalab/MinerU)
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
=======
[

](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
[

](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
[

](https://huggingface.co/spaces/opendatalab/MinerU)
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
[

](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
[

](#)
...
...
@@ -185,13 +180,9 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
### 在线体验
<<<<<<< HEAD
[在线体验点击这里](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
=======
[

](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
[

](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
[

](https://huggingface.co/spaces/opendatalab/MinerU)
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
### 使用CPU快速体验
...
...
app.py
deleted
100644 → 0
View file @
31a1ecdb
# Copyright (c) Opendatalab. All rights reserved.
import
base64
import
os
import
time
import
zipfile
from
pathlib
import
Path
import
re
from
loguru
import
logger
from
magic_pdf.libs.hash_utils
import
compute_sha256
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.tools.common
import
do_parse
,
prepare_env
# Install the UI packages at import time, before `gradio` and `gradio_pdf` are imported.
# NOTE(review): the exit status of os.system is discarded here, so a failed install only
# surfaces later as an ImportError; "pip" is also resolved from PATH rather than from the
# running interpreter — presumably acceptable for a hosted demo, confirm.
os.system("pip install gradio")
os.system("pip install gradio-pdf")
import
gradio
as
gr
from
gradio_pdf
import
PDF
def read_fn(path):
    """Read the file at ``path`` in binary mode through a ``DiskReaderWriter``.

    The reader is created on the file's parent directory and is then asked for the
    file's base name with ``AbsReaderWriter.MODE_BIN``.

    :param path: path of the file to read
    :return: whatever ``DiskReaderWriter.read`` returns for binary mode
    """
    parent_dir, base_name = os.path.split(path)
    reader = DiskReaderWriter(parent_dir)
    return reader.read(base_name, AbsReaderWriter.MODE_BIN)
def parse_pdf(doc_path, output_dir, end_page_id):
    """Parse one PDF with the "auto" method and report where the output was written.

    :param doc_path: path of the PDF to parse
    :param output_dir: directory that receives the output; created if missing
    :param end_page_id: forwarded to ``do_parse`` as ``end_page_id``
    :return: ``(local_md_dir, file_name)`` on success; ``None`` when any step raises
        (the exception is logged and not propagated)
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        # Suffix the document stem with the current timestamp
        # (presumably to keep repeated runs of the same file apart).
        stem = Path(doc_path).stem
        file_name = f"{stem}_{time.time()}"
        pdf_bytes = read_fn(doc_path)
        method = "auto"
        # Only the markdown directory is needed afterwards; the image directory is unused.
        _image_dir, md_dir = prepare_env(output_dir, file_name, method)
        do_parse(
            output_dir,
            file_name,
            pdf_bytes,
            [],
            method,
            False,
            end_page_id=end_page_id,
        )
    except Exception as e:
        logger.exception(e)
        return None
    return md_dir, file_name
def compress_directory_to_zip(directory_path, output_zip_path):
    """
    Compress a directory into a single ZIP file.

    Every file found while walking ``directory_path`` is stored under its path
    relative to that directory.

    :param directory_path: path of the directory to compress
    :param output_zip_path: path of the ZIP file to write
    :return: 0 on success, -1 if any exception was raised (it is logged)
    """
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            for current_dir, _subdirs, filenames in os.walk(directory_path):
                for filename in filenames:
                    absolute_path = os.path.join(current_dir, filename)
                    # Archive member names are relative to the compressed directory.
                    member_name = os.path.relpath(absolute_path, directory_path)
                    archive.write(absolute_path, member_name)
    except Exception as e:
        logger.exception(e)
        return -1
    return 0
def image_to_base64(image_path):
    """Return the contents of ``image_path`` as a base64-encoded UTF-8 string.

    :param image_path: path of the file to encode
    :return: base64 text of the file's bytes
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline every Markdown image reference as a base64 data URI.

    Each ``![alt](relative/path)`` tag is rewritten so that its target is the
    base64-encoded content of ``image_dir_path/relative/path``; the relative path
    is used as the alt text.

    :param markdown_text: Markdown source to rewrite
    :param image_dir_path: directory the image paths are relative to
    :return: the Markdown text with image targets replaced by data URIs
    :raises OSError: if a referenced image file cannot be opened
    """
    # Matches Markdown image tags; group 1 is the link target.
    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'

    def replace(match):
        relative_path = match.group(1)
        full_path = os.path.join(image_dir_path, relative_path)
        base64_image = image_to_base64(full_path)
        # Emit the tag again with the encoded payload instead of dropping the image.
        # NOTE(review): the MIME type is fixed to image/jpeg — confirm the pipeline
        # only writes JPEG images.
        return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)
def to_markdown(file_path, end_pages):
    """Convert a PDF to Markdown and collect the artefacts shown in the UI.

    :param file_path: path of the PDF to convert
    :param end_pages: number of pages to convert; ``end_pages - 1`` is passed to
        ``parse_pdf`` as the last page id
    :return: ``(md_content, txt_content, archive_zip_path, new_pdf_path)`` — the Markdown
        with images inlined as base64, the raw Markdown text, the path of the ZIP archive
        of the output directory, and the path of the ``*_layout.pdf`` file
    :raises RuntimeError: if ``parse_pdf`` reports a failure by returning ``None``
    """
    # parse_pdf logs its own exception and returns None on failure; fail here with a
    # clear message instead of an opaque unpacking TypeError.
    parsed = parse_pdf(file_path, './output', end_pages - 1)
    if parsed is None:
        raise RuntimeError(f"Failed to parse PDF: {file_path}")
    local_md_dir, file_name = parsed

    # The archive is named after the hash of the output directory path.
    archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
    if zip_archive_success == 0:
        logger.info("压缩成功")
    else:
        logger.error("压缩失败")

    md_path = os.path.join(local_md_dir, file_name + ".md")
    with open(md_path, 'r', encoding='utf-8') as f:
        txt_content = f.read()
    md_content = replace_image_with_base64(txt_content, local_md_dir)

    # Path of the layout PDF written next to the Markdown output.
    new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
    return md_content, txt_content, archive_zip_path, new_pdf_path
# def show_pdf(file_path):
# with open(file_path, "rb") as f:
# base64_pdf = base64.b64encode(f.read()).decode('utf-8')
# pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" ' \
# f'width="100%" height="1000" type="application/pdf">'
# return pdf_display
# Math delimiter pairs passed to the Markdown rendering component:
# "$$ ... $$" is marked as display math, "$ ... $" as non-display math.
latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
                    {"left": '$', "right": '$', "display": False}]
def init_model():
    """Request the non-OCR and the OCR model from ``ModelSingleton``.

    ``get_model`` is called once with ``(False, False)`` and once with ``(True, False)``;
    a message is logged after each call.

    :return: 0 when both calls succeed, -1 when either raises (the exception is logged)
    """
    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
    try:
        manager = ModelSingleton()
        requests = (
            (False, "txt_model init final"),
            (True, "ocr_model init final"),
        )
        for first_arg, message in requests:
            manager.get_model(first_arg, False)
            logger.info(message)
    except Exception as e:
        logger.exception(e)
        return -1
    return 0
# Initialise the models once at import time and log the resulting status code.
model_init = init_model()
logger.info(f"model_init: {model_init}")


if __name__ == "__main__":
    # Build the two-column Gradio UI: PDF input and controls on the left,
    # conversion results on the right.
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(variant='panel', scale=5):
                # pdf_show is first bound to a Markdown component, which is the object the
                # ClearButton below is constructed with; it is rebound to the PDF component
                # afterwards.
                # NOTE(review): the Markdown placeholder looks unintentional — confirm.
                pdf_show = gr.Markdown()
                max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
                with gr.Row() as bu_flow:
                    change_bu = gr.Button("Convert")
                    clear_bu = gr.ClearButton([pdf_show], value="Clear")
                pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)

            with gr.Column(variant='panel', scale=5):
                output_file = gr.File(label="convert result", interactive=False)
                with gr.Tabs():
                    with gr.Tab("Markdown rendering"):
                        md = gr.Markdown(label="Markdown rendering", height=900, show_copy_button=True,
                                         latex_delimiters=latex_delimiters, line_breaks=True)
                    with gr.Tab("Markdown text"):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)
        # "Convert" runs to_markdown on the uploaded PDF and the page limit; its four
        # return values fill the rendered view, the text view, the download and the PDF view.
        change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages], outputs=[md, md_text, output_file, pdf_show])
        # Register the result components (and the rebound PDF component) with the clear button.
        clear_bu.add([md, pdf_show, md_text, output_file])

    demo.launch()
magic_pdf/libs/version.py
View file @
20937b09
<<<<<<<
HEAD
__version__
=
"0.7.1"
=======
__version__
=
"0.8.0"
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
magic_pdf/model/doc_analyze_by_custom_model.py
View file @
20937b09
...
...
@@ -106,11 +106,7 @@ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None):
def
doc_analyze
(
pdf_bytes
:
bytes
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
=======
start_page_id
=
0
,
end_page_id
=
None
):
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
model_manager
=
ModelSingleton
()
custom_model
=
model_manager
.
get_model
(
ocr
,
show_log
,
lang
)
...
...
magic_pdf/model/pdf_extract_kit.py
View file @
20937b09
...
...
@@ -74,16 +74,11 @@ def layout_model_init(weight, config_file, device):
return
model
<<<<<<<
HEAD
def
ocr_model_init
(
show_log
:
bool
=
False
,
det_db_box_thresh
=
0.3
,
lang
=
None
):
if
lang
is
not
None
:
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
,
lang
=
lang
)
else
:
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
)
=======
def
ocr_model_init
(
show_log
:
bool
=
False
,
det_db_box_thresh
=
0.3
):
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
return
model
...
...
@@ -142,12 +137,8 @@ def atom_model_init(model_name: str, **kwargs):
elif
model_name
==
AtomicModel
.
OCR
:
atom_model
=
ocr_model_init
(
kwargs
.
get
(
"ocr_show_log"
),
<<<<<<<
HEAD
kwargs
.
get
(
"det_db_box_thresh"
),
kwargs
.
get
(
"lang"
)
=======
kwargs
.
get
(
"det_db_box_thresh"
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
)
elif
model_name
==
AtomicModel
.
Table
:
atom_model
=
table_model_init
(
...
...
@@ -244,12 +235,8 @@ class CustomPEKModel:
self
.
ocr_model
=
atom_model_manager
.
get_atom_model
(
atom_model_name
=
AtomicModel
.
OCR
,
ocr_show_log
=
show_log
,
<<<<<<<
HEAD
det_db_box_thresh
=
0.3
,
lang
=
self
.
lang
=======
det_db_box_thresh
=
0.3
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
)
# init table model
if
self
.
apply_table
:
...
...
magic_pdf/pipe/AbsPipe.py
View file @
20937b09
...
...
@@ -17,11 +17,7 @@ class AbsPipe(ABC):
PIP_TXT
=
"txt"
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
=======
start_page_id
=
0
,
end_page_id
=
None
):
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
self
.
pdf_bytes
=
pdf_bytes
self
.
model_list
=
model_list
self
.
image_writer
=
image_writer
...
...
@@ -29,10 +25,7 @@ class AbsPipe(ABC):
self
.
is_debug
=
is_debug
self
.
start_page_id
=
start_page_id
self
.
end_page_id
=
end_page_id
<<<<<<<
HEAD
self
.
lang
=
lang
=======
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
get_compress_pdf_mid_data
(
self
):
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
...
...
magic_pdf/pipe/OCRPipe.py
View file @
20937b09
...
...
@@ -10,25 +10,16 @@ from magic_pdf.user_api import parse_ocr_pdf
class
OCRPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
=======
start_page_id
=
0
,
end_page_id
=
None
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_classify
(
self
):
pass
def
pipe_analyze
(
self
):
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
<<<<<<<
HEAD
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
=======
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
...
...
magic_pdf/pipe/TXTPipe.py
View file @
20937b09
...
...
@@ -11,25 +11,16 @@ from magic_pdf.user_api import parse_txt_pdf
class
TXTPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
=======
start_page_id
=
0
,
end_page_id
=
None
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_classify
(
self
):
pass
def
pipe_analyze
(
self
):
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
<<<<<<<
HEAD
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
=======
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
...
...
magic_pdf/pipe/UNIPipe.py
View file @
20937b09
...
...
@@ -14,15 +14,9 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class
UNIPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
jso_useful_key
:
dict
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
self
.
pdf_type
=
jso_useful_key
[
"_pdf_type"
]
super
()
.
__init__
(
pdf_bytes
,
jso_useful_key
[
"model_list"
],
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
=======
start_page_id
=
0
,
end_page_id
=
None
):
self
.
pdf_type
=
jso_useful_key
[
"_pdf_type"
]
super
()
.
__init__
(
pdf_bytes
,
jso_useful_key
[
"model_list"
],
image_writer
,
is_debug
,
start_page_id
,
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
if
len
(
self
.
model_list
)
==
0
:
self
.
input_model_is_empty
=
True
else
:
...
...
@@ -34,30 +28,19 @@ class UNIPipe(AbsPipe):
def
pipe_analyze
(
self
):
if
self
.
pdf_type
==
self
.
PIP_TXT
:
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
<<<<<<<
HEAD
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
=======
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
def
pipe_parse
(
self
):
if
self
.
pdf_type
==
self
.
PIP_TXT
:
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
input_model_is_empty
=
self
.
input_model_is_empty
,
<<<<<<<
HEAD
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
=======
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
...
...
magic_pdf/resources/model_config/model_configs.yaml
View file @
20937b09
...
...
@@ -10,10 +10,6 @@ config:
weights
:
layout
:
Layout/model_final.pth
mfd
:
MFD/weights.pt
<<<<<<< HEAD
mfr
:
MFR/unimernet_base
=======
mfr
:
MFR/UniMERNet
>
>>>>>>
0140d7d271ac3b1561ca2272030e9e038b469999
struct_eqtable
:
TabRec/StructEqTable
TableMaster
:
TabRec/TableMaster
\ No newline at end of file
magic_pdf/tools/cli.py
View file @
20937b09
...
...
@@ -45,7 +45,6 @@ without method specified, auto will be used by default.""",
default
=
'auto'
,
)
@
click
.
option
(
<<<<<<<
HEAD
'-l'
,
'--lang'
,
'lang'
,
...
...
@@ -58,8 +57,6 @@ without method specified, auto will be used by default.""",
default
=
None
,
)
@
click
.
option
(
=======
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
'-d'
,
'--debug'
,
'debug_able'
,
...
...
@@ -83,11 +80,7 @@ without method specified, auto will be used by default.""",
help
=
'The ending page for PDF parsing, beginning from 0.'
,
default
=
None
,
)
<<<<<<<
HEAD
def
cli
(
path
,
output_dir
,
method
,
lang
,
debug_able
,
start_page_id
,
end_page_id
):
=======
def
cli
(
path
,
output_dir
,
method
,
debug_able
,
start_page_id
,
end_page_id
):
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
model_config
.
__use_inside_model__
=
True
model_config
.
__model_mode__
=
'full'
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
...
...
@@ -109,10 +102,7 @@ def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
debug_able
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
<<<<<<<
HEAD
lang
=
lang
=======
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
)
except
Exception
as
e
:
...
...
magic_pdf/user_api.py
View file @
20937b09
...
...
@@ -71,11 +71,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
input_model_is_empty
:
bool
=
False
,
<<<<<<<
HEAD
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
,
=======
start_page_id
=
0
,
end_page_id
=
None
,
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
*
args
,
**
kwargs
):
"""
ocr和文本混合的pdf,全部解析出来
...
...
@@ -99,17 +95,11 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
if
input_model_is_empty
:
<<<<<<<
HEAD
pdf_models
=
doc_analyze
(
pdf_bytes
,
ocr
=
True
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
=======
pdf_models
=
doc_analyze
(
pdf_bytes
,
ocr
=
True
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
>>>>>>>
0140
d7d271ac3b1561ca2272030e9e038b469999
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
if
pdf_info_dict
is
None
:
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
...
...
mv_pdf.py
deleted
100644 → 0
View file @
31a1ecdb
import
os
import
shutil
def move_pdfs(root_folder, destination_folder):
    """Move every ``.pdf`` file found under ``root_folder`` into ``destination_folder``.

    The source tree is flattened: files from all subdirectories end up directly in the
    destination. When a file with the same name is already there, the incoming file is
    given a numeric suffix (``name_1.pdf``, ``name_2.pdf``, ...) instead of colliding
    with the existing one.

    :param root_folder: directory tree to search
    :param destination_folder: directory that receives the files; created if missing
    """
    os.makedirs(destination_folder, exist_ok=True)
    destination_abs = os.path.abspath(destination_folder)

    # Walk the root folder and all of its subdirectories.
    for root, dirs, files in os.walk(root_folder):
        # Never descend into the destination itself when it lives under root_folder,
        # otherwise files that were just moved would be visited again.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != destination_abs
        ]
        if os.path.abspath(root) == destination_abs:
            continue

        for file in files:
            if not file.endswith('.pdf'):
                continue
            src_path = os.path.join(root, file)
            dst_path = os.path.join(destination_folder, file)

            # Pick a free name so same-named files from different folders all survive.
            stem, extension = os.path.splitext(file)
            counter = 1
            while os.path.exists(dst_path):
                dst_path = os.path.join(destination_folder, f'{stem}_{counter}{extension}')
                counter += 1

            shutil.move(src_path, dst_path)
            print(f'Moved {file} to {destination_folder}')
# Usage: runs at import time with the hard-coded Windows paths below.
root_folder = r'D:\mineru\datasets\datasets'  # source folder path
destination_folder = r'D:\mineru\datasets\pdf'  # destination folder path

# Create the destination folder if it does not exist.
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

move_pdfs(root_folder, destination_folder)
\ No newline at end of file
projects/README.md
View file @
20937b09
...
...
@@ -3,9 +3,6 @@
## Project List
-
[
llama_index_rag
](
./llama_index_rag/README.md
)
: Build a lightweight RAG system based on llama_index
<<<<<<< HEAD
-
[
gradio_app
](
./gradio_app/README.md
)
: Build a web app based on gradio
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
projects/README_zh-CN.md
View file @
20937b09
...
...
@@ -3,8 +3,5 @@
## 项目列表
-
[
llama_index_rag
](
./llama_index_rag/README_zh-CN.md
)
: 基于 llama_index 构建轻量级 RAG 系统
<<<<<<< HEAD
-
[
gradio_app
](
./gradio_app/README_zh-CN.md
)
: 基于 Gradio 的 Web 应用
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
projects/llama_index_rag/README_zh-CN.md
View file @
20937b09
...
...
@@ -59,10 +59,6 @@ Server: Docker Engine - Community
```
bash
# install
pip
install
modelscope
==
1.14.0
<<<<<<
< HEAD
=======
>>>>>>>
0140d7d271ac3b1561ca2272030e9e038b469999
pip
install
llama-index-vector-stores-elasticsearch
==
0.2.0
pip
install
llama-index-embeddings-dashscope
==
0.2.0
pip
install
llama-index-core
==
0.10.68
...
...
@@ -74,19 +70,12 @@ pip install accelerate==0.33.0
pip uninstall transformer-engine
```
<<<<<<< HEAD
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
## 示例
```
`bash
cd projects/llama_index_rag
<<<<<<< HEAD
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
docker compose up -d
or
...
...
@@ -94,20 +83,14 @@ or
docker-compose up -d
<<<<<<< HEAD
# 配置环境变量
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
export ES_USER=elastic
export ES_PASSWORD=llama_index
export ES_URL=http://127.0.0.1:9200
export DASHSCOPE_API_KEY={some_key}
<<<<<<< HEAD
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
DASHSCOPE_API_KEY 开通参考[文档](https://help.aliyun.com/zh/dashscope/opening-service)
# 未导入数据,查询问题。返回通义千问默认答案
...
...
@@ -135,10 +118,7 @@ python data_ingestion.py -p example/data/declaration_of_the_rights_of_man_1789.p
# 导入数据后,查询问题。通义千问模型会根据 RAG 系统的检索结果,结合上下文,给出答案。
<<<<<<< HEAD
=======
>>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
python query.py -q 'how about the rights of men'
## outputs
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment