Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
683fa633
Unverified
Commit
683fa633
authored
Apr 26, 2024
by
myhloli
Committed by
GitHub
Apr 26, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #77 from myhloli/master
fix
parents
43d1d525
6e2f3097
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
27 additions
and
30 deletions
+27
-30
magicpdf.py
magic_pdf/cli/magicpdf.py
+5
-5
pdf_meta_scan.py
magic_pdf/filter/pdf_meta_scan.py
+2
-2
AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+2
-3
OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+6
-6
TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+6
-6
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+6
-8
No files found.
magic_pdf/cli/magicpdf.py
View file @
683fa633
...
@@ -60,11 +60,11 @@ def prepare_env(pdf_file_name, method):
...
@@ -60,11 +60,11 @@ def prepare_env(pdf_file_name, method):
def
_do_parse
(
pdf_file_name
,
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
,
local_md_dir
):
def
_do_parse
(
pdf_file_name
,
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
,
local_md_dir
):
if
parse_method
==
"auto"
:
if
parse_method
==
"auto"
:
pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
i
mage_dir
,
i
s_debug
=
True
)
pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
)
elif
parse_method
==
"txt"
:
elif
parse_method
==
"txt"
:
pipe
=
TXTPipe
(
pdf_bytes
,
model_list
,
image_writer
,
i
mage_dir
,
i
s_debug
=
True
)
pipe
=
TXTPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
)
elif
parse_method
==
"ocr"
:
elif
parse_method
==
"ocr"
:
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
i
mage_dir
,
i
s_debug
=
True
)
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
)
else
:
else
:
print
(
"unknow parse method"
)
print
(
"unknow parse method"
)
os
.
exit
(
1
)
os
.
exit
(
1
)
...
@@ -74,7 +74,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
...
@@ -74,7 +74,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
md_content
=
pipe
.
pipe_mk_markdown
()
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
)
#part_file_name = datetime.now().strftime("%H-%M-%S")
#part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer
.
write
(
md_writer
.
write
(
content
=
md_content
,
path
=
f
"{pdf_file_name}.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
content
=
md_content
,
path
=
f
"{pdf_file_name}.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
...
@@ -85,7 +85,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
...
@@ -85,7 +85,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
mode
=
AbsReaderWriter
.
MODE_TXT
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
)
try
:
try
:
content_list
=
pipe
.
pipe_mk_uni_format
()
content_list
=
pipe
.
pipe_mk_uni_format
(
image_dir
)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
exception
(
e
)
logger
.
exception
(
e
)
md_writer
.
write
(
md_writer
.
write
(
...
...
magic_pdf/filter/pdf_meta_scan.py
View file @
683fa633
...
@@ -305,7 +305,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
...
@@ -305,7 +305,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
page_width_pts
,
page_height_pts
=
get_pdf_page_size_pts
(
doc
)
page_width_pts
,
page_height_pts
=
get_pdf_page_size_pts
(
doc
)
# logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")
# logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")
svgs_per_page
=
get_svgs_per_page
(
doc
)
#
svgs_per_page = get_svgs_per_page(doc)
# logger.info(f"svgs_per_page: {svgs_per_page}")
# logger.info(f"svgs_per_page: {svgs_per_page}")
imgs_per_page
=
get_imgs_per_page
(
doc
)
imgs_per_page
=
get_imgs_per_page
(
doc
)
# logger.info(f"imgs_per_page: {imgs_per_page}")
# logger.info(f"imgs_per_page: {imgs_per_page}")
...
@@ -331,7 +331,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
...
@@ -331,7 +331,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
"text_len_per_page"
:
text_len_per_page
,
"text_len_per_page"
:
text_len_per_page
,
"text_layout_per_page"
:
text_layout_per_page
,
"text_layout_per_page"
:
text_layout_per_page
,
"text_language"
:
text_language
,
"text_language"
:
text_language
,
"svgs_per_page"
:
svgs_per_page
,
#
"svgs_per_page": svgs_per_page,
"imgs_per_page"
:
imgs_per_page
,
# 增加每页img数量list
"imgs_per_page"
:
imgs_per_page
,
# 增加每页img数量list
"junk_img_bojids"
:
junk_img_bojids
,
# 增加垃圾图片的bojid list
"junk_img_bojids"
:
junk_img_bojids
,
# 增加垃圾图片的bojid list
"metadata"
:
doc
.
metadata
"metadata"
:
doc
.
metadata
...
...
magic_pdf/pipe/AbsPipe.py
View file @
683fa633
...
@@ -16,11 +16,10 @@ class AbsPipe(ABC):
...
@@ -16,11 +16,10 @@ class AbsPipe(ABC):
PIP_OCR
=
"ocr"
PIP_OCR
=
"ocr"
PIP_TXT
=
"txt"
PIP_TXT
=
"txt"
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
i
mg_parent_path
:
str
,
is_debug
:
bool
=
False
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
i
s_debug
:
bool
=
False
):
self
.
pdf_bytes
=
pdf_bytes
self
.
pdf_bytes
=
pdf_bytes
self
.
model_list
=
model_list
self
.
model_list
=
model_list
self
.
image_writer
=
image_writer
self
.
image_writer
=
image_writer
self
.
img_parent_path
=
img_parent_path
self
.
pdf_mid_data
=
None
# 未压缩
self
.
pdf_mid_data
=
None
# 未压缩
self
.
is_debug
=
is_debug
self
.
is_debug
=
is_debug
...
...
magic_pdf/pipe/OCRPipe.py
View file @
683fa633
...
@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_ocr_pdf
...
@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_ocr_pdf
class
OCRPipe
(
AbsPipe
):
class
OCRPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
i
mg_parent_path
:
str
,
is_debug
:
bool
=
False
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
i
s_debug
:
bool
=
False
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
i
mg_parent_path
,
i
s_debug
)
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
)
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
pass
pass
...
@@ -15,10 +15,10 @@ class OCRPipe(AbsPipe):
...
@@ -15,10 +15,10 @@ class OCRPipe(AbsPipe):
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
return
content_list
return
content_list
def
pipe_mk_markdown
(
self
):
def
pipe_mk_markdown
(
self
,
img_parent_path
:
str
):
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
return
md_content
return
md_content
magic_pdf/pipe/TXTPipe.py
View file @
683fa633
...
@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_txt_pdf
...
@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_txt_pdf
class
TXTPipe
(
AbsPipe
):
class
TXTPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
i
mg_parent_path
:
str
,
is_debug
:
bool
=
False
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
i
s_debug
:
bool
=
False
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
i
mg_parent_path
,
i
s_debug
)
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
)
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
pass
pass
...
@@ -15,10 +15,10 @@ class TXTPipe(AbsPipe):
...
@@ -15,10 +15,10 @@ class TXTPipe(AbsPipe):
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
return
content_list
return
content_list
def
pipe_mk_markdown
(
self
):
def
pipe_mk_markdown
(
self
,
img_parent_path
:
str
):
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
return
md_content
return
md_content
magic_pdf/pipe/UNIPipe.py
View file @
683fa633
...
@@ -10,10 +10,8 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
...
@@ -10,10 +10,8 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class
UNIPipe
(
AbsPipe
):
class
UNIPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
,
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
):
is_debug
:
bool
=
False
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
)
self
.
pdf_type
=
self
.
PIP_OCR
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
,
is_debug
)
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
self
.
pdf_type
=
UNIPipe
.
classify
(
self
.
pdf_bytes
)
self
.
pdf_type
=
UNIPipe
.
classify
(
self
.
pdf_bytes
)
...
@@ -26,12 +24,12 @@ class UNIPipe(AbsPipe):
...
@@ -26,12 +24,12 @@ class UNIPipe(AbsPipe):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
return
content_list
return
content_list
def
pipe_mk_markdown
(
self
):
def
pipe_mk_markdown
(
self
,
img_parent_path
:
str
):
markdown_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
markdown_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
return
markdown_content
return
markdown_content
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment