Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
aad5652c
Commit
aad5652c
authored
Jun 26, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update: fix cli and inside model used logic
parent
3aa8ccdc
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
24 additions
and
12 deletions
+24
-12
magicpdf.py
magic_pdf/cli/magicpdf.py
+8
-10
__init__.py
magic_pdf/model/__init__.py
+1
-0
doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+9
-1
pp_structure_v2.py
magic_pdf/model/pp_structure_v2.py
+6
-1
No files found.
magic_pdf/cli/magicpdf.py
View file @
aad5652c
...
@@ -23,7 +23,6 @@ python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home
...
@@ -23,7 +23,6 @@ python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home
import
os
import
os
import
json
as
json_parse
import
json
as
json_parse
import
sys
import
click
import
click
from
loguru
import
logger
from
loguru
import
logger
from
pathlib
import
Path
from
pathlib
import
Path
...
@@ -46,9 +45,9 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
...
@@ -46,9 +45,9 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
import
csv
import
csv
import
copy
import
copy
import
magic_pdf.model
as
model_config
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
use_inside_model
=
False
def
prepare_env
(
pdf_file_name
,
method
):
def
prepare_env
(
pdf_file_name
,
method
):
...
@@ -67,7 +66,7 @@ def write_to_csv(csv_file_path, csv_data):
...
@@ -67,7 +66,7 @@ def write_to_csv(csv_file_path, csv_data):
csv_writer
=
csv
.
writer
(
csvfile
)
csv_writer
=
csv
.
writer
(
csvfile
)
# 写入数据
# 写入数据
csv_writer
.
writerow
(
csv_data
)
csv_writer
.
writerow
(
csv_data
)
print
(
f
"数据已成功追加到 '{csv_file_path}'"
)
logger
.
info
(
f
"数据已成功追加到 '{csv_file_path}'"
)
def
do_parse
(
def
do_parse
(
...
@@ -98,17 +97,17 @@ def do_parse(
...
@@ -98,17 +97,17 @@ def do_parse(
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
)
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
)
else
:
else
:
logger
.
error
(
"unknown parse method"
)
logger
.
error
(
"unknown parse method"
)
sys
.
exit
(
1
)
exit
(
1
)
pipe
.
pipe_classify
()
pipe
.
pipe_classify
()
"""如果没有传入有效的模型数据,则使用内置model解析"""
"""如果没有传入有效的模型数据,则使用内置model解析"""
if
len
(
model_list
)
==
0
:
if
len
(
model_list
)
==
0
:
if
use_inside_model
:
if
model_config
.
__use_inside_model__
:
pipe
.
pipe_analyze
()
pipe
.
pipe_analyze
()
else
:
else
:
logger
.
error
(
"need model list input"
)
logger
.
error
(
"need model list input"
)
sys
.
exit
(
1
)
exit
(
1
)
pipe
.
pipe_parse
()
pipe
.
pipe_parse
()
pdf_info
=
pipe
.
pdf_mid_data
[
"pdf_info"
]
pdf_info
=
pipe
.
pdf_mid_data
[
"pdf_info"
]
...
@@ -177,8 +176,8 @@ def cli():
...
@@ -177,8 +176,8 @@ def cli():
)
)
def
json_command
(
json
,
method
):
def
json_command
(
json
,
method
):
if
not
json
.
startswith
(
"s3://"
):
if
not
json
.
startswith
(
"s3://"
):
print
(
"usage: python magipdf.py
--json s3://some_bucket/some_path"
)
logger
.
error
(
"usage: magic-pdf json-command
--json s3://some_bucket/some_path"
)
sys
.
exit
(
1
)
exit
(
1
)
def
read_s3_path
(
s3path
):
def
read_s3_path
(
s3path
):
bucket
,
key
=
parse_s3path
(
s3path
)
bucket
,
key
=
parse_s3path
(
s3path
)
...
@@ -274,8 +273,7 @@ def local_json_command(local_json, method):
...
@@ -274,8 +273,7 @@ def local_json_command(local_json, method):
)
)
@
click
.
option
(
"--inside_model"
,
type
=
click
.
BOOL
,
default
=
False
,
help
=
"使用内置模型测试"
)
@
click
.
option
(
"--inside_model"
,
type
=
click
.
BOOL
,
default
=
False
,
help
=
"使用内置模型测试"
)
def
pdf_command
(
pdf
,
model
,
method
,
inside_model
):
def
pdf_command
(
pdf
,
model
,
method
,
inside_model
):
global
use_inside_model
model_config
.
__use_inside_model__
=
inside_model
use_inside_model
=
inside_model
def
read_fn
(
path
):
def
read_fn
(
path
):
disk_rw
=
DiskReaderWriter
(
os
.
path
.
dirname
(
path
))
disk_rw
=
DiskReaderWriter
(
os
.
path
.
dirname
(
path
))
...
...
magic_pdf/model/__init__.py
View file @
aad5652c
__use_inside_model__
=
False
magic_pdf/model/doc_analyze_by_custom_model.py
View file @
aad5652c
...
@@ -2,9 +2,10 @@ import fitz
...
@@ -2,9 +2,10 @@ import fitz
import
cv2
import
cv2
from
PIL
import
Image
from
PIL
import
Image
import
numpy
as
np
import
numpy
as
np
from
loguru
import
logger
from
magic_pdf.model.model_list
import
MODEL
from
magic_pdf.model.model_list
import
MODEL
from
magic_pdf.model.pp_structure_v2
import
CustomPaddleModel
import
magic_pdf.model
as
model_config
def
dict_compare
(
d1
,
d2
):
def
dict_compare
(
d1
,
d2
):
...
@@ -41,6 +42,13 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
...
@@ -41,6 +42,13 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
def
doc_analyze
(
pdf_bytes
:
bytes
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
,
model
=
MODEL
.
Paddle
):
def
doc_analyze
(
pdf_bytes
:
bytes
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
,
model
=
MODEL
.
Paddle
):
if
model_config
.
__use_inside_model__
:
from
magic_pdf.model.pp_structure_v2
import
CustomPaddleModel
else
:
logger
.
error
(
"use_inside_model is False, not allow to use inside model"
)
exit
(
1
)
images
=
load_images_from_pdf
(
pdf_bytes
)
images
=
load_images_from_pdf
(
pdf_bytes
)
custom_model
=
None
custom_model
=
None
if
model
==
MODEL
.
Paddle
:
if
model
==
MODEL
.
Paddle
:
...
...
magic_pdf/model/pp_structure_v2.py
View file @
aad5652c
import
random
import
random
from
loguru
import
logger
from
loguru
import
logger
from
paddleocr
import
PPStructure
try
:
from
paddleocr
import
PPStructure
except
ImportError
:
logger
.
warning
(
'paddleocr not installed, please install by "pip install magic-pdf[cpu]" or "pip install magic-pdf[gpu]"'
)
exit
(
1
)
def
region_to_bbox
(
region
):
def
region_to_bbox
(
region
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment