Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
698c4a83
Commit
698c4a83
authored
Apr 17, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改pipe模块
parent
ffc20db7
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
28 additions
and
39 deletions
+28
-39
magicpdf.py
magic_pdf/cli/magicpdf.py
+3
-3
para_split.py
magic_pdf/para/para_split.py
+2
-0
AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+8
-1
OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+4
-10
TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+4
-10
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+7
-13
DiskReaderWriter.py
magic_pdf/rw/DiskReaderWriter.py
+0
-2
No files found.
magic_pdf/cli/magicpdf.py
View file @
698c4a83
...
...
@@ -62,13 +62,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
if
parse_method
==
"ocr"
:
jso_useful_key
[
"_pdf_type"
]
=
"ocr"
pdf_mid_data
=
uni_pipe
.
pipe_parse
()
md_content
=
UNIPipe
.
mk_markdown
(
pdf_mid_data
,
image_dir
)
uni_pipe
.
pipe_parse
()
md_content
=
uni_pipe
.
pipe_mk_markdown
(
)
part_file_name
=
datetime
.
now
()
.
strftime
(
"
%
H-
%
M-
%
S"
)
md_writer
.
write
(
content
=
md_content
,
path
=
f
"{part_file_name}.md"
,
mode
=
MODE_TXT
)
md_writer
.
write
(
content
=
json_parse
.
dumps
(
JsonCompressor
.
decompress_json
(
pdf_mid_data
)
,
ensure_ascii
=
False
,
indent
=
4
uni_pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"{part_file_name}.json"
,
mode
=
MODE_TXT
,
...
...
magic_pdf/para/para_split.py
View file @
698c4a83
...
...
@@ -589,6 +589,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
3. 参照上述行尾特征进行分段。
4. 图、表,目前独占一行,不考虑分段。
"""
if
page_num
==
343
:
pass
lines_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
layout_paras
,
layout_list_info
=
__split_para_in_layoutbox
(
lines_group
,
new_layout_bbox
,
lang
)
# layout内分段
layout_paras2
,
page_list_info
=
__connect_list_inter_layout
(
layout_paras
,
new_layout_bbox
,
layout_list_info
,
page_num
,
lang
)
# layout之间连接列表段落
...
...
magic_pdf/pipe/AbsPipe.py
View file @
698c4a83
...
...
@@ -13,11 +13,18 @@ class AbsPipe(ABC):
"""
txt和ocr处理的抽象类
"""
PIP_OCR
=
"ocr"
PIP_TXT
=
"txt"
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
,
):
self
.
pdf_bytes
=
pdf_bytes
self
.
model_list
=
model_list
self
.
image_writer
=
image_writer
self
.
img_parent_path
=
img_parent_path
self
.
pdf_mid_data
=
None
# 未压缩
def
get_compress_pdf_mid_data
(
self
):
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
@
abstractmethod
def
pipe_classify
(
self
):
...
...
magic_pdf/pipe/OCRPipe.py
View file @
698c4a83
...
...
@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_ocr_pdf
class
OCRPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_bucket_path
:
str
):
self
.
compressed_pdf_mid_data
=
None
self
.
pdf_mid_data
=
None
self
.
pdf_bytes
=
pdf_bytes
self
.
model_list
=
model_list
self
.
image_writer
=
image_writer
self
.
img_bucket_path
=
img_bucket_path
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
)
def
pipe_classify
(
self
):
pass
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
compressed_pdf_mid_data
=
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
def
pipe_mk_uni_format
(
self
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
compressed_pdf_mid_data
,
self
.
img_bucke
t_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_paren
t_path
)
return
content_list
def
pipe_mk_markdown
(
self
):
md_content
=
AbsPipe
.
mk_markdown
(
self
.
compressed_pdf_mid_data
,
self
.
img_bucke
t_path
)
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_paren
t_path
)
return
md_content
magic_pdf/pipe/TXTPipe.py
View file @
698c4a83
...
...
@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_txt_pdf
class
TXTPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_bucket_path
:
str
):
self
.
compressed_pdf_mid_data
=
None
self
.
pdf_mid_data
=
None
self
.
pdf_bytes
=
pdf_bytes
self
.
model_list
=
model_list
self
.
image_writer
=
image_writer
self
.
img_bucket_path
=
img_bucket_path
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
)
def
pipe_classify
(
self
):
pass
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
compressed_pdf_mid_data
=
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
def
pipe_mk_uni_format
(
self
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
compressed_pdf_mid_data
,
self
.
img_bucke
t_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_paren
t_path
)
return
content_list
def
pipe_mk_markdown
(
self
):
md_content
=
AbsPipe
.
mk_markdown
(
self
.
compressed_pdf_mid_data
,
self
.
img_bucke
t_path
)
md_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_paren
t_path
)
return
md_content
magic_pdf/pipe/UNIPipe.py
View file @
698c4a83
...
...
@@ -15,31 +15,25 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class
UNIPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_bucket_path
:
str
):
self
.
pdf_type
=
"ocr"
self
.
compressed_pdf_mid_data
=
None
self
.
pdf_mid_data
=
None
self
.
pdf_bytes
=
pdf_bytes
self
.
model_list
=
model_list
self
.
image_writer
=
image_writer
self
.
img_bucket_path
=
img_bucket_path
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
):
self
.
pdf_type
=
self
.
PIP_OCR
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
)
def
pipe_classify
(
self
):
self
.
pdf_type
=
UNIPipe
.
classify
(
self
.
pdf_bytes
)
def
pipe_parse
(
self
):
if
self
.
pdf_type
==
"txt"
:
if
self
.
pdf_type
==
self
.
PIP_TXT
:
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
elif
self
.
pdf_type
==
"ocr"
:
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
compressed_pdf_mid_data
=
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
def
pipe_mk_uni_format
(
self
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
compressed_pdf_mid_data
,
self
.
img_bucke
t_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_paren
t_path
)
return
content_list
def
pipe_mk_markdown
(
self
):
markdown_content
=
AbsPipe
.
mk_markdown
(
self
.
compressed_pdf_mid_data
,
self
.
img_bucke
t_path
)
markdown_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_paren
t_path
)
return
markdown_content
if
__name__
==
'__main__'
:
...
...
magic_pdf/rw/DiskReaderWriter.py
View file @
698c4a83
...
...
@@ -41,12 +41,10 @@ class DiskReaderWriter(AbsReaderWriter):
if
mode
==
MODE_TXT
:
with
open
(
abspath
,
"w"
,
encoding
=
self
.
encoding
)
as
f
:
f
.
write
(
content
)
logger
.
info
(
f
"内容已成功写入 {abspath}"
)
elif
mode
==
MODE_BIN
:
with
open
(
abspath
,
"wb"
)
as
f
:
f
.
write
(
content
)
logger
.
info
(
f
"内容已成功写入 {abspath}"
)
else
:
raise
ValueError
(
"Invalid mode. Use 'text' or 'binary'."
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment