Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
084e9328
Commit
084e9328
authored
Mar 14, 2024
by
xuchao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
实现layout内部分段
parent
f68c6629
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
246 additions
and
9 deletions
+246
-9
.gitignore
.gitignore
+1
-0
ocr_demo.py
demo/ocr_demo.py
+6
-5
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+27
-0
para_split.py
magic_pdf/para/para_split.py
+206
-0
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+6
-4
No files found.
.gitignore
View file @
084e9328
...
...
@@ -31,5 +31,6 @@ tmp
.vscode
.vscode/
/tests/
ocr_demo
/app/common/__init__.py
demo/ocr_demo.py
View file @
084e9328
...
...
@@ -4,7 +4,7 @@ import os
from
loguru
import
logger
from
pathlib
import
Path
from
magic_pdf.dict2md.ocr_mkcontent
import
ocr_mk_nlp_markdown
,
ocr_
mk_mm_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
mk_mm_markdown2
,
mk_nlp_markdown
,
mk_mm_markdown
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
...
...
@@ -34,8 +34,9 @@ if __name__ == '__main__':
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_fix.json"
ocr_pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
ocr_json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
try
:
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
...
...
@@ -56,8 +57,8 @@ if __name__ == '__main__':
if
not
os
.
path
.
exists
(
parent_dir
):
os
.
makedirs
(
parent_dir
)
# markdown_content =
ocr_
mk_nlp_markdown(pdf_info_dict)
markdown_content
=
ocr_mk_mm_markdown
(
pdf_info_dict
)
# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content
=
mk_mm_markdown2
(
pdf_info_dict
)
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
markdown_content
)
...
...
magic_pdf/dict2md/ocr_mkcontent.py
View file @
084e9328
...
...
@@ -53,3 +53,30 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
# 在行末添加两个空格以强制换行
markdown
.
append
(
line_text
.
strip
()
+
' '
)
return
'
\n
'
.
join
(
markdown
)
def mk_mm_markdown2(pdf_info_dict: dict):
    """Render the paragraph-split pages of ``pdf_info_dict`` as Markdown.

    Each value of ``pdf_info_dict`` is treated as one page. Pages whose
    ``"para_blocks"`` entry is missing or empty are skipped. Every paragraph
    (a list of lines, each line holding a ``'spans'`` list) becomes one string,
    and paragraphs are joined with a blank line.

    Args:
        pdf_info_dict: mapping of page key -> page info dict.

    Returns:
        The Markdown text; an empty string when there are no paragraphs.
    """

    def render_span(span):
        """Return the Markdown fragment contributed by a single span."""
        span_type = span.get('type')
        if span_type == 'text':
            return span['content']
        if span_type == 'inline_equation':
            return f" ${span['content']}$ "
        if span_type == 'displayed_equation':
            return f"$$\n{span['content']}\n$$ "
        if span_type == 'image':
            # NOTE(review): an image span contributes only a single space;
            # no image link is emitted here - confirm this is intended.
            return " "
        # Any other span type contributes nothing.
        return ''

    markdown = []
    for page_info in pdf_info_dict.values():
        for para in page_info.get("para_blocks") or []:
            # Spans are concatenated without a separator, including across
            # line boundaries.
            markdown.append(
                ''.join(render_span(span) for line in para for span in line['spans'])
            )
    return '\n\n'.join(markdown)
\ No newline at end of file
magic_pdf/para/para_split.py
0 → 100644
View file @
084e9328
from
sklearn.cluster
import
DBSCAN
import
numpy
as
np
from
loguru
import
logger
from
magic_pdf.libs.boxbase
import
_is_in
# Characters that may terminate a sentence/paragraph at the end of a line.
# The full-width variants are written as escapes so that tooling which applies
# Unicode compatibility normalization cannot silently collapse them into
# duplicates of their ASCII counterparts.
# NOTE(review): the full-width entries replace what were duplicated ASCII
# entries ('!', '?', ':', ')'); confirm against the upstream file.
LINE_STOP_FLAG = [
    '.', '!', '?',
    '\u3002',  # ideographic full stop
    '\uff01',  # full-width exclamation mark
    '\uff1f',  # full-width question mark
    '\uff1a',  # full-width colon
    ':',
    ')',
    '\uff09',  # full-width right parenthesis
    ';',
]
# Span type names used by the paragraph splitter.
INLINE_EQUATION = 'inline_equation'
INTER_EQUATION = "displayed_equation"
TEXT = "text"
def __add_line_period(blocks, layout_bboxes):
    """Append a period to lines that end with an unpunctuated inline equation.

    A line whose last span is an inline equation that does not end with one of
    ``LINE_STOP_FLAG`` is treated as the end of a paragraph, so a '.' is
    appended to that span's (stripped) content. Lines ending in any other span
    type are left untouched. Spans are modified in place.

    Args:
        blocks: list of block dicts, each with a 'lines' list; every line has
            a 'spans' list of span dicts carrying 'type' and 'content'.
        layout_bboxes: unused; kept so all pre-processing steps share one
            signature.
    """
    for block in blocks:
        for line in block['lines']:
            spans = line['spans']
            if not spans:
                # Nothing to inspect on a line without spans.
                continue
            last_span = spans[-1]
            if last_span['type'] != INLINE_EQUATION:
                continue
            span_content = last_span['content'].strip()
            # Skip empty equations: there is no last character to test.
            if span_content and span_content[-1] not in LINE_STOP_FLAG:
                last_span['content'] = span_content + '.'
def __valign_lines(blocks, layout_bboxes):
    """Snap nearly-aligned line edges inside each layout box to a common x.

    For every layout box, the left (x0) and right (x1) edges of all lines that
    belong to blocks inside that box are clustered with DBSCAN. Left edges in
    the same cluster are moved to the cluster minimum, right edges to the
    cluster maximum. Lines that fall in no cluster are left as they are.
    Afterwards each affected block's bbox is recomputed from its lines.
    Line and block bboxes are modified in place.

    The clustering distance (3) is an empirical value.
    TODO: derive it from the data instead of hard-coding it.

    Args:
        blocks: list of block dicts with 'bbox' and 'lines'; each line has a
            mutable 'bbox' of the form [x0, y0, x1, y1].
        layout_bboxes: list of dicts with a 'layout_bbox' entry.
    """
    min_distance = 3
    min_sample = 2

    def snap_targets(coords, pick):
        """Map every clustered coordinate to ``pick`` of its cluster."""
        # DBSCAN needs 2-D samples; the second column is a constant filler.
        points = np.array([[coord, 0] for coord in coords])
        labels = DBSCAN(eps=min_distance, min_samples=min_sample).fit(points).labels_
        targets = {}  # old coordinate value -> new coordinate value
        for label in np.unique(labels):
            if label == -1:
                # -1 marks samples that DBSCAN assigned to no cluster.
                continue
            members = points[labels == label][:, 0]
            snapped = pick(members)
            targets.update({old: snapped for old in members})
        return targets

    for layout_box in layout_bboxes:
        blocks_in_layoutbox = [
            b for b in blocks if _is_in(b['bbox'], layout_box['layout_bbox'])
        ]
        lines_in_layoutbox = [
            line for block in blocks_in_layoutbox for line in block['lines']
        ]
        if not lines_in_layoutbox:
            # No blocks, or only blocks without lines: nothing to cluster.
            continue

        x0_2_new_val = snap_targets(
            [line['bbox'][0] for line in lines_in_layoutbox], np.min)
        x1_2_new_val = snap_targets(
            [line['bbox'][2] for line in lines_in_layoutbox], np.max)

        for line in lines_in_layoutbox:
            x0, x1 = line['bbox'][0], line['bbox'][2]
            if x0 in x0_2_new_val:
                line['bbox'][0] = int(x0_2_new_val[x0])
            if x1 in x1_2_new_val:
                line['bbox'][2] = int(x1_2_new_val[x1])
            # Edges that could not be aligned stay unchanged.

        # Line extents changed, so recompute each block's bbox from its lines.
        for block in blocks_in_layoutbox:
            lines = block['lines']
            if not lines:
                continue
            block['bbox'] = [
                min(line['bbox'][0] for line in lines),
                min(line['bbox'][1] for line in lines),
                max(line['bbox'][2] for line in lines),
                max(line['bbox'][3] for line in lines),
            ]
def __common_pre_proc(blocks, layout_bboxes):
    """Run the language-independent pre-processing steps on ``blocks``.

    First adds missing line-ending periods, then aligns line edges. Both steps
    receive the same ``blocks`` and ``layout_bboxes`` arguments.
    """
    for step in (__add_line_period, __valign_lines):
        step(blocks, layout_bboxes)
def __pre_proc_zh_blocks(blocks, layout_bboxes):
    """Pre-process Chinese text before paragraph splitting.

    Placeholder: not implemented yet, does nothing and returns None.
    """
    return None
def __pre_proc_en_blocks(blocks, layout_bboxes):
    """Pre-process English text before paragraph splitting.

    Placeholder: not implemented yet, does nothing and returns None.
    """
    return None
def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
    """Collect the lines of ``blocks`` per layout box.

    Returns a list with one entry per element of ``layout_bboxes`` (same
    order). Each entry is the flat list of lines of every block whose bbox
    lies inside that layout's 'layout_bbox', as decided by ``_is_in``.
    ``lang`` is accepted but not used.
    """
    # Original author's note: a block currently holds a single line, so one
    # block corresponds to one paragraph at this stage.
    lines_group = []
    for layout in layout_bboxes:
        lines_in_layout = []
        for block in blocks:
            if _is_in(block['bbox'], layout['layout_bbox']):
                lines_in_layout.extend(block['lines'])
        lines_group.append(lines_in_layout)
    return lines_group
def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_len=10):
    """Split the lines of each layout group into paragraphs.

    For every group the right boundary is the largest x1 of its lines. Lines
    are accumulated into the current paragraph, which is closed when:

    * the line's last span is text or an inline equation and either its last
      character is in ``LINE_STOP_FLAG`` or the line ends more than
      ``1.5 * char_avg_len`` short of the right boundary; or
    * the line's last span is of any other type (the original comment names
      images, tables and displayed equations); such a line closes the
      paragraph it is appended to.

    Lines left over at the end of a group form a final paragraph, so a
    paragraph never spans two layout groups.

    Args:
        lines_group: list of line lists, one per layout box.
        layout_bboxes: unused; kept for interface compatibility.
        lang: unused; kept for interface compatibility.
        char_avg_len: assumed average character width, used for the
            short-line threshold.

    Returns:
        A flat list of paragraphs; each paragraph is a list of line dicts.
    """
    paras = []
    right_tail_distance = 1.5 * char_avg_len
    for lines in lines_group:
        if not lines:
            continue
        layout_right = max(line['bbox'][2] for line in lines)
        para = []  # lines of the paragraph currently being built
        for line in lines:
            para.append(line)
            spans = line['spans']
            if not spans:
                # No span to judge the line ending by; keep accumulating.
                continue
            last_span = spans[-1]
            if last_span['type'] in [TEXT, INLINE_EQUATION]:
                content = last_span['content']
                # Empty content has no last character to compare.
                ends_with_stop = bool(content) and content[-1] in LINE_STOP_FLAG
                is_short_line = line['bbox'][2] < layout_right - right_tail_distance
                para_ends = ends_with_stop or is_short_line
            else:
                # Any other span type ends the paragraph at this line.
                para_ends = True
            if para_ends:
                paras.append(para)
                para = []
        if para:
            # Flush lines that never met an end-of-paragraph condition.
            paras.append(para)
    return paras
def __do_split(blocks, layout_bboxes, lang="en"):
    """Split lines into paragraphs based on line and layout information.

    This is a first, simple approach driven by line-ending features:

    1. Scan the lines of each layout and find those ending some distance
       short of the layout's right boundary.
    2. Among those, find lines ending with a period or another break mark.
    3. Split paragraphs according to these line-ending features.
    4. Images and tables currently occupy a line of their own and are not
       considered for splitting.

    Returns:
        The paragraphs produced by the in-layout split.
    """
    # Splitting inside each layout box.
    grouped_lines = __group_line_by_layout(blocks, layout_bboxes, lang)
    # Joining paragraphs across blocks is not implemented at this point.
    return __split_para_in_layoutbox(grouped_lines, layout_bboxes, lang)
def para_split(blocks, layout_bboxes, lang="en"):
    """Split the lines of ``blocks`` into paragraphs using layout information.

    Runs the common pre-processing on ``blocks`` (which modifies them in
    place) and then performs the split exactly once.

    Args:
        blocks: block dicts of one page.
        layout_bboxes: layout boxes of the same page.
        lang: language code, forwarded to the splitter. Every language
            currently takes the same code path.

    Returns:
        The list of paragraphs returned by the splitter.
    """
    __common_pre_proc(blocks, layout_bboxes)
    # The split result was previously computed and discarded once for 'en'
    # and 'zh' before being computed again; a single call is sufficient.
    splited_blocks = __do_split(blocks, layout_bboxes, lang)
    return splited_blocks
magic_pdf/pdf_parse_by_ocr.py
View file @
084e9328
...
...
@@ -16,6 +16,7 @@ from magic_pdf.libs.commons import (
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.para.para_split
import
para_split
from
magic_pdf.pre_proc.detect_footer_by_model
import
parse_footers
from
magic_pdf.pre_proc.detect_footnote
import
parse_footnotes_by_model
from
magic_pdf.pre_proc.detect_header
import
parse_headers
...
...
@@ -31,12 +32,13 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
def
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
def
construct_page_component
(
blocks
,
para_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
need_remove_spans_bboxes_dict
):
return_dict
=
{
'preproc_blocks'
:
blocks
,
"para_blocks"
:
para_blocks
,
# 分好段落的blocks
'layout_bboxes'
:
layout_bboxes
,
'page_idx'
:
page_id
,
'page_size'
:
[
page_w
,
page_h
],
...
...
@@ -234,13 +236,13 @@ def parse_pdf_by_ocr(
blocks
=
merge_lines_to_block
(
lines
)
# 根据block合并段落
para_blocks
=
para_split
(
blocks
,
layout_bboxes
)
# 获取QA需要外置的list
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
# 构造pdf_info_dict
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
construct_page_component
(
blocks
,
para_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
need_remove_spans_bboxes_dict
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment