Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
06063014
Commit
06063014
authored
Jun 17, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Make paddle analyze mode adapt to the CLI input mode to improve analysis speed
parent
39b46ea9
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
38 additions
and
4 deletions
+38
-4
magicpdf.py
magic_pdf/cli/magicpdf.py
+7
-2
AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+7
-0
OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+4
-1
TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+4
-0
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+12
-1
user_api.py
magic_pdf/user_api.py
+4
-0
No files found.
magic_pdf/cli/magicpdf.py
View file @
06063014
...
...
@@ -87,6 +87,11 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
sys
.
exit
(
1
)
pipe
.
pipe_classify
()
'''如果没有传入有效的模型数据,则使用内置paddle解析'''
if
len
(
model_list
)
==
0
:
pipe
.
pipe_analyze
()
pipe
.
pipe_parse
()
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
...
...
@@ -255,8 +260,8 @@ def pdf_command(pdf, model, method):
model_path
=
pdf
.
replace
(
".pdf"
,
".json"
)
if
not
os
.
path
.
exists
(
model_path
):
logger
.
warning
(
f
"not found json {model_path} existed, use paddle analyze"
)
# 本地无模型数据则调用内置paddle分析
model_json
=
json_parse
.
dumps
(
doc_analyze
(
pdf_data
,
ocr
=
False
,
show_log
=
True
))
# 本地无模型数据则调用内置paddle分析
,先传空list,在内部识别到空list再调用paddle
model_json
=
"[]"
else
:
model_json
=
read_fn
(
model_path
)
.
decode
(
"utf-8"
)
else
:
...
...
magic_pdf/pipe/AbsPipe.py
View file @
06063014
...
...
@@ -33,6 +33,13 @@ class AbsPipe(ABC):
"""
raise
NotImplementedError
@abstractmethod
def pipe_analyze(self):
    """
    Run the model analysis step statefully.

    This is the abstract hook; the base implementation only raises, so
    concrete pipes have to supply their own version.

    Raises:
        NotImplementedError: always, when the base method itself is called.
    """
    raise NotImplementedError
@
abstractmethod
def
pipe_parse
(
self
):
"""
...
...
magic_pdf/pipe/OCRPipe.py
View file @
06063014
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.model.doc_analyze_by_pp_structurev2
import
doc_analyze
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.user_api
import
parse_ocr_pdf
...
...
@@ -13,6 +13,9 @@ class OCRPipe(AbsPipe):
def pipe_classify(self):
    """Classification hook; intentionally a no-op for this pipe."""
    pass
def pipe_analyze(self):
    """
    Run `doc_analyze` over `self.pdf_bytes` with `ocr=True` and keep the
    result in `self.model_list`, replacing whatever was stored before.
    """
    self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
def pipe_parse(self):
    """
    Parse the PDF with `parse_ocr_pdf` and store the returned value in
    `self.pdf_mid_data`.

    Inputs are taken from the instance: the raw PDF bytes, the current
    model list, the image writer and the debug flag.
    """
    self.pdf_mid_data = parse_ocr_pdf(
        self.pdf_bytes,
        self.model_list,
        self.image_writer,
        is_debug=self.is_debug,
    )
...
...
magic_pdf/pipe/TXTPipe.py
View file @
06063014
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.model.doc_analyze_by_pp_structurev2
import
doc_analyze
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
...
...
@@ -13,6 +14,9 @@ class TXTPipe(AbsPipe):
def pipe_classify(self):
    """Classification hook; intentionally a no-op for this pipe."""
    pass
def pipe_analyze(self):
    """
    Run `doc_analyze` over `self.pdf_bytes` with `ocr=False` and keep the
    result in `self.model_list`, replacing whatever was stored before.
    """
    self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
def pipe_parse(self):
    """
    Parse the PDF with `parse_txt_pdf` and store the returned value in
    `self.pdf_mid_data`.

    Inputs are taken from the instance: the raw PDF bytes, the current
    model list, the image writer and the debug flag.
    """
    self.pdf_mid_data = parse_txt_pdf(
        self.pdf_bytes,
        self.model_list,
        self.image_writer,
        is_debug=self.is_debug,
    )
...
...
magic_pdf/pipe/UNIPipe.py
View file @
06063014
...
...
@@ -3,6 +3,7 @@ import json
from
loguru
import
logger
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.model.doc_analyze_by_pp_structurev2
import
doc_analyze
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.commons
import
join_path
...
...
@@ -15,14 +16,24 @@ class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
    """
    Build the pipe from raw PDF bytes and a dict of pre-computed values.

    Args:
        pdf_bytes: raw content of the PDF.
        jso_useful_key: dict that must contain the keys "_pdf_type" and
            "model_list"; both are read here.
        image_writer: writer object forwarded to the base initializer.
        is_debug: debug flag forwarded to the base initializer.
    """
    self.pdf_type = jso_useful_key["_pdf_type"]
    super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
    # Record whether the caller supplied no model data, before any later
    # analysis step repopulates `self.model_list`.
    # NOTE(review): relies on the base initializer storing its second
    # argument as `self.model_list` -- confirm against AbsPipe.__init__.
    self.input_model_is_empty = len(self.model_list) == 0
def pipe_classify(self):
    """
    Re-derive the PDF type by calling `AbsPipe.classify` on the raw bytes
    and overwrite `self.pdf_type` with the result.
    """
    self.pdf_type = AbsPipe.classify(self.pdf_bytes)
def pipe_analyze(self):
    """
    Run `doc_analyze` on the PDF bytes, choosing the OCR flag from the
    current `self.pdf_type`, and store the result in `self.model_list`.

    `PIP_TXT` selects `ocr=False`; `PIP_OCR` selects `ocr=True`. For any
    other type nothing is analyzed and `self.model_list` is left as is.
    """
    if self.pdf_type == self.PIP_TXT:
        use_ocr = False
    elif self.pdf_type == self.PIP_OCR:
        use_ocr = True
    else:
        # Unknown type: keep the existing model list untouched.
        return
    self.model_list = doc_analyze(self.pdf_bytes, ocr=use_ocr)
def pipe_parse(self):
    """
    Parse the PDF according to `self.pdf_type` and store the parser's
    return value in `self.pdf_mid_data`.

    `PIP_TXT` goes through `parse_union_pdf`; `PIP_OCR` goes through
    `parse_ocr_pdf`. For any other type nothing is parsed and
    `self.pdf_mid_data` is not assigned.
    """
    if self.pdf_type == self.PIP_TXT:
        # `input_model_is_empty` tells parse_union_pdf that the model list
        # was not supplied by the caller, so it can re-run the analysis
        # with OCR if it has to fall back to OCR parsing.
        self.pdf_mid_data = parse_union_pdf(
            self.pdf_bytes,
            self.model_list,
            self.image_writer,
            is_debug=self.is_debug,
            input_model_is_empty=self.input_model_is_empty,
        )
    elif self.pdf_type == self.PIP_OCR:
        self.pdf_mid_data = parse_ocr_pdf(
            self.pdf_bytes,
            self.model_list,
            self.image_writer,
            is_debug=self.is_debug,
        )
...
...
magic_pdf/user_api.py
View file @
06063014
...
...
@@ -16,6 +16,7 @@ import re
from
loguru
import
logger
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.model.doc_analyze_by_pp_structurev2
import
doc_analyze
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr_v2
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt_v2
import
parse_pdf_by_txt
...
...
@@ -65,6 +66,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
input_model_is_empty
:
bool
=
False
,
*
args
,
**
kwargs
):
"""
ocr和文本混合的pdf,全部解析出来
...
...
@@ -119,6 +121,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
or
not_printable_rate
>
0.02
# 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr"
)
if
input_model_is_empty
:
pdf_models
=
doc_analyze
(
pdf_bytes
,
ocr
=
True
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
if
pdf_info_dict
is
None
:
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment