Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
f36c2656
Commit
f36c2656
authored
Mar 22, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
使用面积占比方式判断一行文本是不是在一个layoutbox里
parent
a36ef4f8
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
99 additions
and
89 deletions
+99
-89
ocr_demo.py
demo/ocr_demo.py
+2
-2
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+19
-18
boxbase.py
magic_pdf/libs/boxbase.py
+27
-0
para_split.py
magic_pdf/para/para_split.py
+51
-69
No files found.
demo/ocr_demo.py
View file @
f36c2656
...
...
@@ -92,5 +92,5 @@ if __name__ == '__main__':
ocr_json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
#
ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
ocr_online_parse
(
book_name
=
"美国加州中学教材/edu_00000060"
)
ocr_local_parse
(
ocr_pdf_path
,
ocr_json_file_path
)
#
ocr_online_parse(book_name="美国加州中学教材/edu_00000060")
magic_pdf/dict2md/ocr_mkcontent.py
View file @
f36c2656
...
...
@@ -71,9 +71,10 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
def
ocr_mk_mm_markdown_with_para
(
pdf_info_dict
:
dict
):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
():
paras
=
page_info
.
get
(
"para_blocks"
)
if
not
paras
:
paras
_of_layout
=
page_info
.
get
(
"para_blocks"
)
if
not
paras
_of_layout
:
continue
for
paras
in
paras_of_layout
:
for
para
in
paras
:
para_text
=
''
for
line
in
para
:
...
...
@@ -81,12 +82,12 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
span_type
=
span
.
get
(
'type'
)
if
span_type
==
ContentType
.
Text
:
content
=
split_long_words
(
span
[
'content'
])
# content = span['content']
pass
elif
span_type
==
ContentType
.
InlineEquation
:
content
=
f
"${span['content']}$
"
content
=
f
" ${span['content']}$
"
elif
span_type
==
ContentType
.
InterlineEquation
:
content
=
f
"
\n
$$
\n
{span['content']}
\n
$$
\n
"
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
elif
span_type
in
[
ContentType
.
Image
,
ContentType
.
Table
]:
content
=
f
"
\n
})
\n
"
para_text
+=
content
+
' '
markdown
.
append
(
para_text
.
strip
()
+
' '
)
...
...
magic_pdf/libs/boxbase.py
View file @
f36c2656
...
...
@@ -18,6 +18,33 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
y1_1
<
y0_2
or
# box1在box2的上边
y0_1
>
y1_2
)
# box1在box2的下边
def
_is_in_or_part_overlap_with_area_ratio
(
box1
,
box2
,
area_ratio_threshold
=
0.6
):
"""
判断box1是否在box2里面,或者box1和box2有部分重叠,且重叠面积占box1的比例超过area_ratio_threshold
"""
if
box1
is
None
or
box2
is
None
:
return
False
x0_1
,
y0_1
,
x1_1
,
y1_1
=
box1
x0_2
,
y0_2
,
x1_2
,
y1_2
=
box2
if
not
_is_in_or_part_overlap
(
box1
,
box2
):
return
False
# 计算重叠面积
x_left
=
max
(
x0_1
,
x0_2
)
y_top
=
max
(
y0_1
,
y0_2
)
x_right
=
min
(
x1_1
,
x1_2
)
y_bottom
=
min
(
y1_1
,
y1_2
)
overlap_area
=
(
x_right
-
x_left
)
*
(
y_bottom
-
y_top
)
# 计算box1的面积
box1_area
=
(
x1_1
-
x0_1
)
*
(
y1_1
-
y0_1
)
return
overlap_area
/
box1_area
>
area_ratio_threshold
def
_is_in
(
box1
,
box2
)
->
bool
:
"""
box1是否完全在box2里面
...
...
magic_pdf/para/para_split.py
View file @
f36c2656
...
...
@@ -2,7 +2,7 @@ from sklearn.cluster import DBSCAN
import
numpy
as
np
from
loguru
import
logger
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
_with_area_ratio
as
is_in_layout
from
magic_pdf.libs.ocr_content_type
import
ContentType
...
...
@@ -20,44 +20,6 @@ def __get_span_text(span):
return
c
def
__add_line_period
(
blocks
,
layout_bboxes
):
"""
为每行添加句号
如果这个行
1. 以行内公式结尾,但没有任何标点符号,此时加个句号,认为他就是段落结尾。
"""
for
block
in
blocks
:
for
line
in
block
[
'lines'
]:
last_span
=
line
[
'spans'
][
-
1
]
span_type
=
last_span
[
'type'
]
if
span_type
in
[
INLINE_EQUATION
]:
span_content
=
last_span
[
'content'
]
.
strip
()
if
span_type
==
INLINE_EQUATION
and
span_content
[
-
1
]
not
in
LINE_STOP_FLAG
:
if
span_type
in
[
INLINE_EQUATION
,
INTERLINE_EQUATION
]:
last_span
[
'content'
]
=
span_content
+
'.'
def
__detect_line_align_direction
(
line
,
new_layout_bboxes
):
"""
探测line是左对齐,还是右对齐,还是居中。
"""
lbox
=
line
[
'bbox'
]
x0
,
x1
=
lbox
[
0
],
lbox
[
2
]
layout_x0
,
layout_x1
=
new_layout_bboxes
[
0
],
new_layout_bboxes
[
2
]
if
x0
<=
layout_x0
and
x1
<
layout_x1
:
return
"left"
elif
x0
>
layout_x0
and
x1
>=
layout_x1
:
return
"right"
else
:
return
"center"
def
__detect_line_group_align_direction
(
lines
,
new_layout_bboxes
):
"""
首先把lines按照行距离分成几部分。针对每一部分分别探测。
最后返回[(dir, lines), (dir, lines), ...]
"""
pass
def
__detect_list_lines
(
lines
,
new_layout_bboxes
,
lang
=
'en'
):
"""
探测是否包含了列表,并且把列表的行分开.
...
...
@@ -166,7 +128,7 @@ def __valign_lines(blocks, layout_bboxes):
new_layout_bboxes
=
[]
for
layout_box
in
layout_bboxes
:
blocks_in_layoutbox
=
[
b
for
b
in
blocks
if
_is_in_or_part_overlap
(
b
[
'bbox'
],
layout_box
[
'layout_bbox'
])]
blocks_in_layoutbox
=
[
b
for
b
in
blocks
if
is_in_layout
(
b
[
'bbox'
],
layout_box
[
'layout_bbox'
])]
if
len
(
blocks_in_layoutbox
)
==
0
:
continue
...
...
@@ -252,7 +214,7 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
lines_group
=
[]
for
lyout
in
layout_bboxes
:
lines
=
[
line
for
block
in
blocks
if
_is_in_or_part_overlap
(
block
[
'bbox'
],
lyout
[
'layout_bbox'
])
for
line
in
block
[
'lines'
]]
lines
=
[
line
for
block
in
blocks
if
is_in_layout
(
block
[
'bbox'
],
lyout
[
'layout_bbox'
])
for
line
in
block
[
'lines'
]]
lines_group
.
append
(
lines
)
return
lines_group
...
...
@@ -267,12 +229,19 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
且下一行开头不留空白。
"""
li
ne_group_end_with_list
=
[]
# 这个layout最后是不是列表,用于跨layout列表合并
paras
=
[]
li
st_info
=
[]
# 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
layout_
paras
=
[]
right_tail_distance
=
1.5
*
char_avg_len
for
lines
in
lines_group
:
paras
=
[]
total_lines
=
len
(
lines
)
if
total_lines
<=
1
:
# 0行无需处理。1行无法分段。
if
total_lines
==
0
:
continue
# 0行无需处理
if
total_lines
==
1
:
# 1行无法分段。
layout_paras
.
append
([
lines
])
list_info
.
append
([
False
,
False
])
continue
"""在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测,
...
...
@@ -292,7 +261,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_right
=
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
new_layout_bbox
)[
2
]
layout_left
=
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
new_layout_bbox
)[
0
]
para
=
[]
# 元素是line
is_lines_end_with_list
=
False
layout_list_info
=
[
False
,
False
]
# 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
for
content_type
,
start
,
end
in
text_segments
:
if
content_type
==
'list'
:
for
i
,
line
in
enumerate
(
lines
[
start
:
end
+
1
]):
...
...
@@ -307,7 +276,10 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
para
=
[]
is_lines_end_with_list
=
True
if
start
==
0
:
layout_list_info
[
0
]
=
True
if
end
==
total_lines
-
1
:
layout_list_info
[
1
]
=
True
else
:
for
i
,
line
in
enumerate
(
lines
[
start
:
end
+
1
]):
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断一下行结尾特征。
...
...
@@ -335,12 +307,18 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
para
=
[]
is_lines_end_with_list
=
False
line_group_end_with_list
.
append
(
is_lines_end_with_list
)
list_info
.
append
(
layout_list_info
)
layout_paras
.
append
(
paras
)
paras
=
[]
return
layout_paras
,
list_info
def
__connect_list_inter_layout
(
layout_paras
,
new_layout_bbox
,
layout_list_info
,
lang
=
"en"
):
# TODO
return
paras
,
line_group_end_with_list
return
layout_paras
def
__find_layout_bbox_by_line
(
line_bbox
,
layout_bboxes
):
...
...
@@ -348,12 +326,12 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
根据line找到所在的layout
"""
for
layout
in
layout_bboxes
:
if
_is_in_or_part_overlap
(
line_bbox
,
layout
):
if
is_in_layout
(
line_bbox
,
layout
):
return
layout
return
None
def
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
l
ine_group_end_with_list
,
l
ang
=
"en"
):
def
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
lang
=
"en"
):
"""
layout之间进行分段。
主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
...
...
@@ -363,21 +341,18 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, line_group_end
"""
connected_layout_paras
=
[]
for
i
,
para
in
enumerate
(
layout_paras
):
if
i
==
0
:
connected_layout_paras
.
append
(
para
)
continue
pre_last_line
=
layout_paras
[
i
-
1
][
-
1
]
next_first_line
=
layout_paras
[
i
][
0
]
connected_layout_paras
.
append
(
layout_paras
[
0
])
for
i
in
range
(
1
,
len
(
layout_paras
)):
pre_last_line
=
layout_paras
[
i
-
1
][
-
1
][
-
1
]
next_first_line
=
layout_paras
[
i
][
0
][
0
]
pre_last_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
pre_last_line
[
'spans'
]])
pre_last_line_type
=
pre_last_line
[
'spans'
][
-
1
][
'type'
]
next_first_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
next_first_line
[
'spans'
]])
next_first_line_type
=
next_first_line
[
'spans'
][
0
][
'type'
]
if
pre_last_line_type
not
in
[
TEXT
,
INLINE_EQUATION
]
or
next_first_line_type
not
in
[
TEXT
,
INLINE_EQUATION
]:
# TODO,真的要做好,要考虑跨table, image, 行间的情况
connected_layout_paras
.
append
(
para
)
if
pre_last_line_type
not
in
[
TEXT
,
INLINE_EQUATION
]
or
next_first_line_type
not
in
[
TEXT
,
INLINE_EQUATION
]:
connected_layout_paras
.
append
(
layout_paras
[
i
]
)
continue
pre_x2_max
=
__find_layout_bbox_by_line
(
pre_last_line
[
'bbox'
],
new_layout_bbox
)[
2
]
next_x0_min
=
__find_layout_bbox_by_line
(
next_first_line
[
'bbox'
],
new_layout_bbox
)[
0
]
...
...
@@ -385,10 +360,15 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, line_group_end
next_first_line_text
=
next_first_line_text
.
strip
()
if
pre_last_line
[
'bbox'
][
2
]
==
pre_x2_max
and
pre_last_line_text
[
-
1
]
not
in
LINE_STOP_FLAG
and
next_first_line
[
'bbox'
][
0
]
==
next_x0_min
:
# 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
"""连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
connected_layout_paras
[
-
1
]
.
extend
(
para
)
connected_layout_paras
[
-
1
][
-
1
]
.
extend
(
layout_paras
[
i
][
0
])
layout_paras
[
i
]
.
pop
(
0
)
# 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。
if
len
(
layout_paras
[
i
])
==
0
:
layout_paras
.
pop
(
i
)
else
:
connected_layout_paras
.
append
(
layout_paras
[
i
])
else
:
"""连接段落条件不成立,将前一个layout的段落加入到结果中。"""
connected_layout_paras
.
append
(
para
)
connected_layout_paras
.
append
(
layout_paras
[
i
]
)
return
connected_layout_paras
...
...
@@ -403,8 +383,8 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
# 有的页面可能压根没有文字
if
len
(
pre_page_paras
)
==
0
or
len
(
next_page_paras
)
==
0
:
return
False
pre_last_para
=
pre_page_paras
[
-
1
]
next_first_para
=
next_page_paras
[
0
]
pre_last_para
=
pre_page_paras
[
-
1
]
[
-
1
]
next_first_para
=
next_page_paras
[
0
]
[
0
]
pre_last_line
=
pre_last_para
[
-
1
]
next_first_line
=
next_first_para
[
0
]
pre_last_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
pre_last_line
[
'spans'
]])
...
...
@@ -423,8 +403,8 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
next_first_line_text
=
next_first_line_text
.
strip
()
if
pre_last_line
[
'bbox'
][
2
]
==
pre_x2_max
and
pre_last_line_text
[
-
1
]
not
in
LINE_STOP_FLAG
and
next_first_line
[
'bbox'
][
0
]
==
next_x0_min
:
# 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
"""连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
pre_
page_paras
[
-
1
]
.
extend
(
next_first_para
)
next_page_paras
.
pop
(
0
)
# 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。
pre_
last_para
.
extend
(
next_first_para
)
next_page_paras
[
0
]
.
pop
(
0
)
# 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。
return
True
else
:
return
False
...
...
@@ -443,8 +423,10 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, lang="en"):
4. 图、表,目前独占一行,不考虑分段。
"""
lines_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
layout_paras
,
line_group_end_with_list
=
__split_para_in_layoutbox
(
lines_group
,
new_layout_bbox
,
lang
)
# layout内分段
connected_layout_paras
=
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
line_group_end_with_list
,
lang
)
# layout间链接段落
layout_paras
,
layout_list_info
=
__split_para_in_layoutbox
(
lines_group
,
new_layout_bbox
,
lang
)
# layout内分段
layout_paras2
=
__connect_list_inter_layout
(
layout_paras
,
new_layout_bbox
,
layout_list_info
,
lang
)
# layout之间连接列表段落
connected_layout_paras
=
__connect_para_inter_layoutbox
(
layout_paras2
,
new_layout_bbox
,
lang
)
# layout间链接段落
return
connected_layout_paras
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment