Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
4bd31ced
Commit
4bd31ced
authored
Apr 30, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
change content make logic to union_make
parent
6bcd4c31
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
28 additions
and
33 deletions
+28
-33
magicpdf.py
magic_pdf/cli/magicpdf.py
+4
-3
AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+8
-18
OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+5
-4
TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+5
-4
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+6
-4
No files found.
magic_pdf/cli/magicpdf.py
View file @
4bd31ced
...
@@ -28,6 +28,7 @@ import click
...
@@ -28,6 +28,7 @@ import click
from
loguru
import
logger
from
loguru
import
logger
from
pathlib
import
Path
from
pathlib
import
Path
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
...
@@ -78,8 +79,8 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
...
@@ -78,8 +79,8 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
)
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
,
drop_mode
=
DropMode
.
NONE
)
#part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer
.
write
(
md_writer
.
write
(
content
=
md_content
,
path
=
f
"{pdf_file_name}.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
content
=
md_content
,
path
=
f
"{pdf_file_name}.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
)
...
@@ -89,7 +90,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
...
@@ -89,7 +90,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
mode
=
AbsReaderWriter
.
MODE_TXT
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
)
try
:
try
:
content_list
=
pipe
.
pipe_mk_uni_format
(
image_dir
)
content_list
=
pipe
.
pipe_mk_uni_format
(
image_dir
,
drop_mode
=
DropMode
.
NONE
)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
exception
(
e
)
logger
.
exception
(
e
)
md_writer
.
write
(
md_writer
.
write
(
...
...
magic_pdf/pipe/AbsPipe.py
View file @
4bd31ced
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
,
mk_mm_markdown
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
,
mk_mm_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
make_standard_format_with_para
,
ocr_mk_mm_markdown_with_para
from
magic_pdf.dict2md.ocr_mkcontent
import
make_standard_format_with_para
,
ocr_mk_mm_markdown_with_para
,
union_make
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.libs.MakeContentConfig
import
MakeMode
,
DropMode
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
...
@@ -41,14 +42,14 @@ class AbsPipe(ABC):
...
@@ -41,14 +42,14 @@ class AbsPipe(ABC):
raise
NotImplementedError
raise
NotImplementedError
@
abstractmethod
@
abstractmethod
def
pipe_mk_uni_format
(
self
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
,
drop_mode
):
"""
"""
有状态的组装统一格式
有状态的组装统一格式
"""
"""
raise
NotImplementedError
raise
NotImplementedError
@
abstractmethod
@
abstractmethod
def
pipe_mk_markdown
(
self
):
def
pipe_mk_markdown
(
self
,
img_parent_path
,
drop_mode
):
"""
"""
有状态的组装markdown
有状态的组装markdown
"""
"""
...
@@ -83,34 +84,23 @@ class AbsPipe(ABC):
...
@@ -83,34 +84,23 @@ class AbsPipe(ABC):
return
AbsPipe
.
PIP_OCR
return
AbsPipe
.
PIP_OCR
@
staticmethod
@
staticmethod
def
mk_uni_format
(
compressed_pdf_mid_data
:
str
,
img_buket_path
:
str
)
->
list
:
def
mk_uni_format
(
compressed_pdf_mid_data
:
str
,
img_buket_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
)
->
list
:
"""
"""
根据pdf类型,生成统一格式content_list
根据pdf类型,生成统一格式content_list
"""
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
parse_type
=
pdf_mid_data
[
"_parse_type"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
if
parse_type
==
AbsPipe
.
PIP_TXT
:
content_list
=
union_make
(
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
drop_mode
,
img_buket_path
)
# content_list = mk_universal_format(pdf_info_list, img_buket_path)
content_list
=
make_standard_format_with_para
(
pdf_info_list
,
img_buket_path
)
elif
parse_type
==
AbsPipe
.
PIP_OCR
:
content_list
=
make_standard_format_with_para
(
pdf_info_list
,
img_buket_path
)
return
content_list
return
content_list
@
staticmethod
@
staticmethod
def
mk_markdown
(
compressed_pdf_mid_data
:
str
,
img_buket_path
:
str
)
->
list
:
def
mk_markdown
(
compressed_pdf_mid_data
:
str
,
img_buket_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
)
->
list
:
"""
"""
根据pdf类型,markdown
根据pdf类型,markdown
"""
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
parse_type
=
pdf_mid_data
[
"_parse_type"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
if
parse_type
==
AbsPipe
.
PIP_TXT
:
md_content
=
union_make
(
pdf_info_list
,
MakeMode
.
MM_MD
,
drop_mode
,
img_buket_path
)
# content_list = mk_universal_format(pdf_info_list, img_buket_path)
# md_content = mk_mm_markdown(content_list)
md_content
=
ocr_mk_mm_markdown_with_para
(
pdf_info_list
,
img_buket_path
)
elif
parse_type
==
AbsPipe
.
PIP_OCR
:
md_content
=
ocr_mk_mm_markdown_with_para
(
pdf_info_list
,
img_buket_path
)
return
md_content
return
md_content
magic_pdf/pipe/OCRPipe.py
View file @
4bd31ced
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
...
@@ -15,10 +16,10 @@ class OCRPipe(AbsPipe):
...
@@ -15,10 +16,10 @@ class OCRPipe(AbsPipe):
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
,
drop_mode
)
return
content_list
return
content_list
def
pipe_mk_markdown
(
self
,
img_parent_path
:
str
):
def
pipe_mk_markdown
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
,
drop_mode
)
return
md_content
return
md_content
magic_pdf/pipe/TXTPipe.py
View file @
4bd31ced
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
...
@@ -15,10 +16,10 @@ class TXTPipe(AbsPipe):
...
@@ -15,10 +16,10 @@ class TXTPipe(AbsPipe):
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
,
drop_mode
)
return
content_list
return
content_list
def
pipe_mk_markdown
(
self
,
img_parent_path
:
str
):
def
pipe_mk_markdown
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
,
drop_mode
)
return
md_content
return
md_content
magic_pdf/pipe/UNIPipe.py
View file @
4bd31ced
import
json
import
json
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
...
@@ -25,12 +27,12 @@ class UNIPipe(AbsPipe):
...
@@ -25,12 +27,12 @@ class UNIPipe(AbsPipe):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
,
drop_mode
)
return
content_list
return
content_list
def
pipe_mk_markdown
(
self
,
img_parent_path
:
str
):
def
pipe_mk_markdown
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
markdown_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
)
markdown_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
img_parent_path
,
drop_mode
)
return
markdown_content
return
markdown_content
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment