Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
ffc20db7
Commit
ffc20db7
authored
Apr 16, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改包引用问题
parent
31f3d4cc
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
7 additions
and
7 deletions
+7
-7
magicpdf.py
magic_pdf/cli/magicpdf.py
+2
-2
AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+1
-1
OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+1
-1
TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+1
-1
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+2
-2
No files found.
magic_pdf/cli/magicpdf.py
View file @
ffc20db7
...
@@ -54,7 +54,7 @@ def prepare_env():
...
@@ -54,7 +54,7 @@ def prepare_env():
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
uni_pipe
=
UNIPipe
()
uni_pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
)
jso_useful_key
=
{
jso_useful_key
=
{
"_pdf_type"
:
"txt"
,
"_pdf_type"
:
"txt"
,
"model_list"
:
model_list
,
"model_list"
:
model_list
,
...
@@ -62,7 +62,7 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
...
@@ -62,7 +62,7 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
if
parse_method
==
"ocr"
:
if
parse_method
==
"ocr"
:
jso_useful_key
[
"_pdf_type"
]
=
"ocr"
jso_useful_key
[
"_pdf_type"
]
=
"ocr"
pdf_mid_data
=
uni_pipe
.
p
arse
(
pdf_bytes
,
image_writer
,
jso_useful_key
)
pdf_mid_data
=
uni_pipe
.
p
ipe_parse
(
)
md_content
=
UNIPipe
.
mk_markdown
(
pdf_mid_data
,
image_dir
)
md_content
=
UNIPipe
.
mk_markdown
(
pdf_mid_data
,
image_dir
)
part_file_name
=
datetime
.
now
()
.
strftime
(
"
%
H-
%
M-
%
S"
)
part_file_name
=
datetime
.
now
()
.
strftime
(
"
%
H-
%
M-
%
S"
)
md_writer
.
write
(
content
=
md_content
,
path
=
f
"{part_file_name}.md"
,
mode
=
MODE_TXT
)
md_writer
.
write
(
content
=
md_content
,
path
=
f
"{part_file_name}.md"
,
mode
=
MODE_TXT
)
...
...
magic_pdf/pipe/AbsPipe.py
View file @
ffc20db7
...
@@ -4,7 +4,7 @@ from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
...
@@ -4,7 +4,7 @@ from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
make_standard_format_with_para
,
ocr_mk_mm_markdown_with_para
from
magic_pdf.dict2md.ocr_mkcontent
import
make_standard_format_with_para
,
ocr_mk_mm_markdown_with_para
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.
io
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.
rw
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
...
...
magic_pdf/pipe/OCRPipe.py
View file @
ffc20db7
from
magic_pdf.
io
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.
rw
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.user_api
import
parse_ocr_pdf
from
magic_pdf.user_api
import
parse_ocr_pdf
...
...
magic_pdf/pipe/TXTPipe.py
View file @
ffc20db7
from
magic_pdf.
io
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.
rw
.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.user_api
import
parse_txt_pdf
from
magic_pdf.user_api
import
parse_txt_pdf
...
...
magic_pdf/pipe/UNIPipe.py
View file @
ffc20db7
...
@@ -29,9 +29,9 @@ class UNIPipe(AbsPipe):
...
@@ -29,9 +29,9 @@ class UNIPipe(AbsPipe):
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
if
self
.
pdf_type
==
"txt"
:
if
self
.
pdf_type
==
"txt"
:
self
.
pdf_mid_data
=
parse_union_pdf
(
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
elif
self
.
pdf_type
==
"ocr"
:
elif
self
.
pdf_type
==
"ocr"
:
self
.
pdf_mid_data
=
parse_ocr_pdf
(
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
compressed_pdf_mid_data
=
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
self
.
compressed_pdf_mid_data
=
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
def
pipe_mk_uni_format
(
self
):
def
pipe_mk_uni_format
(
self
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment