Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
1f45e0ab
Commit
1f45e0ab
authored
Apr 17, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加debug模式
parent
f702defe
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
28 additions
and
17 deletions
+28
-17
magicpdf.py
magic_pdf/cli/magicpdf.py
+1
-2
magic_model.py
magic_pdf/model/magic_model.py
+13
-2
para_split.py
magic_pdf/para/para_split.py
+2
-2
AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+2
-1
OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+3
-3
TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+3
-3
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+4
-4
No files found.
magic_pdf/cli/magicpdf.py
View file @
1f45e0ab
...
@@ -35,7 +35,6 @@ from magic_pdf.libs.path_utils import (
...
@@ -35,7 +35,6 @@ from magic_pdf.libs.path_utils import (
from
magic_pdf.libs.config_reader
import
get_local_dir
from
magic_pdf.libs.config_reader
import
get_local_dir
from
magic_pdf.rw.S3ReaderWriter
import
S3ReaderWriter
,
MODE_BIN
,
MODE_TXT
from
magic_pdf.rw.S3ReaderWriter
import
S3ReaderWriter
,
MODE_BIN
,
MODE_TXT
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
...
@@ -54,7 +53,7 @@ def prepare_env():
...
@@ -54,7 +53,7 @@ def prepare_env():
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
uni_pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
)
uni_pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
jso_useful_key
=
{
jso_useful_key
=
{
"_pdf_type"
:
"txt"
,
"_pdf_type"
:
"txt"
,
"model_list"
:
model_list
,
"model_list"
:
model_list
,
...
...
magic_pdf/model/magic_model.py
View file @
1f45e0ab
...
@@ -17,6 +17,11 @@ class MagicModel():
...
@@ -17,6 +17,11 @@ class MagicModel():
def
get_imgs
(
self
,
page_no
:
int
):
# @许瑞
def
get_imgs
(
self
,
page_no
:
int
):
# @许瑞
return_lst
=
[]
return_lst
=
[]
image_block
=
{
}
img
=
{
img
=
{
"bbox"
:[
x0
,
y0
,
x1
,
y1
]
"bbox"
:[
x0
,
y0
,
x1
,
y1
]
}
}
...
@@ -24,10 +29,16 @@ class MagicModel():
...
@@ -24,10 +29,16 @@ class MagicModel():
"bbox"
:[
x0
,
y0
,
x1
,
y1
],
"bbox"
:[
x0
,
y0
,
x1
,
y1
],
"text"
:
""
,
"text"
:
""
,
}
}
return
[{
"img"
:
img
,
"caption"
:
img_caption
},]
image_block
[
'bbox'
]
=
[
x0
,
y0
,
x1
,
y1
]
# 计算出来
image_block
[
'img_body'
]
=
img
image_blcok
[
'img_caption'
]
=
img_caption
return
[
image_block
,]
def
get_tables
(
self
,
page_no
:
int
)
->
list
:
# 3个坐标, caption, table主体,table-note
def
get_tables
(
self
,
page_no
:
int
)
->
list
:
# 3个坐标, caption, table主体,table-note
pass
# 许瑞
pass
# 许瑞
, 结构和image一样
def
get_equations
(
self
,
page_no
:
int
)
->
list
:
# 有坐标,也有字
def
get_equations
(
self
,
page_no
:
int
)
->
list
:
# 有坐标,也有字
return
inline_equations
,
interline_equations
# @凯文
return
inline_equations
,
interline_equations
# @凯文
...
...
magic_pdf/para/para_split.py
View file @
1f45e0ab
...
@@ -549,7 +549,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
...
@@ -549,7 +549,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
if
"Table"
in
first_line_text
or
"Figure"
in
first_line_text
:
if
"Table"
in
first_line_text
or
"Figure"
in
first_line_text
:
pass
pass
if
debug_mode
:
if
debug_mode
:
logger
.
info
(
line_hi
.
std
())
logger
.
debug
(
line_hi
.
std
())
if
line_hi
.
std
()
<
2
:
if
line_hi
.
std
()
<
2
:
"""行高度相同,那么判断是否居中"""
"""行高度相同,那么判断是否居中"""
...
@@ -562,7 +562,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
...
@@ -562,7 +562,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
merge_para
=
[
l
[
0
]
for
l
in
layout_para
[
start
:
end
+
1
]]
merge_para
=
[
l
[
0
]
for
l
in
layout_para
[
start
:
end
+
1
]]
para_text
=
''
.
join
([
__get_span_text
(
span
)
for
line
in
merge_para
for
span
in
line
[
'spans'
]])
para_text
=
''
.
join
([
__get_span_text
(
span
)
for
line
in
merge_para
for
span
in
line
[
'spans'
]])
if
debug_mode
:
if
debug_mode
:
logger
.
info
(
para_text
)
logger
.
debug
(
para_text
)
layout_para
[
start
:
end
+
1
]
=
[
merge_para
]
layout_para
[
start
:
end
+
1
]
=
[
merge_para
]
index_offset
-=
end
-
start
index_offset
-=
end
-
start
...
...
magic_pdf/pipe/AbsPipe.py
View file @
1f45e0ab
...
@@ -16,12 +16,13 @@ class AbsPipe(ABC):
...
@@ -16,12 +16,13 @@ class AbsPipe(ABC):
PIP_OCR
=
"ocr"
PIP_OCR
=
"ocr"
PIP_TXT
=
"txt"
PIP_TXT
=
"txt"
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
,
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
,
is_debug
:
bool
=
False
):
self
.
pdf_bytes
=
pdf_bytes
self
.
pdf_bytes
=
pdf_bytes
self
.
model_list
=
model_list
self
.
model_list
=
model_list
self
.
image_writer
=
image_writer
self
.
image_writer
=
image_writer
self
.
img_parent_path
=
img_parent_path
self
.
img_parent_path
=
img_parent_path
self
.
pdf_mid_data
=
None
# 未压缩
self
.
pdf_mid_data
=
None
# 未压缩
self
.
is_debug
=
is_debug
def
get_compress_pdf_mid_data
(
self
):
def
get_compress_pdf_mid_data
(
self
):
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
...
...
magic_pdf/pipe/OCRPipe.py
View file @
1f45e0ab
...
@@ -6,14 +6,14 @@ from magic_pdf.user_api import parse_ocr_pdf
...
@@ -6,14 +6,14 @@ from magic_pdf.user_api import parse_ocr_pdf
class
OCRPipe
(
AbsPipe
):
class
OCRPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
,
is_debug
:
bool
=
False
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
)
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
,
is_debug
)
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
pass
pass
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
):
def
pipe_mk_uni_format
(
self
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
...
...
magic_pdf/pipe/TXTPipe.py
View file @
1f45e0ab
...
@@ -6,14 +6,14 @@ from magic_pdf.user_api import parse_txt_pdf
...
@@ -6,14 +6,14 @@ from magic_pdf.user_api import parse_txt_pdf
class
TXTPipe
(
AbsPipe
):
class
TXTPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
,
is_debug
:
bool
=
False
):
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
)
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
,
is_debug
)
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
pass
pass
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
):
def
pipe_mk_uni_format
(
self
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
...
...
magic_pdf/pipe/UNIPipe.py
View file @
1f45e0ab
...
@@ -15,18 +15,18 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
...
@@ -15,18 +15,18 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class
UNIPipe
(
AbsPipe
):
class
UNIPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
,
is_debug
:
bool
=
False
):
self
.
pdf_type
=
self
.
PIP_OCR
self
.
pdf_type
=
self
.
PIP_OCR
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
)
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
,
is_debug
)
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
self
.
pdf_type
=
UNIPipe
.
classify
(
self
.
pdf_bytes
)
self
.
pdf_type
=
UNIPipe
.
classify
(
self
.
pdf_bytes
)
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
if
self
.
pdf_type
==
self
.
PIP_TXT
:
if
self
.
pdf_type
==
self
.
PIP_TXT
:
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
)
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
):
def
pipe_mk_uni_format
(
self
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment