Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
91d296de
Unverified
Commit
91d296de
authored
Apr 19, 2024
by
myhloli
Committed by
GitHub
Apr 19, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #31 from myhloli/master
重构 parse_by_ocr_v2.py
parents
0f5e0b01
f5341e16
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
151 additions
and
45 deletions
+151
-45
ocr_content_type.py
magic_pdf/libs/ocr_content_type.py
+13
-0
parse_by_ocr_v2.py
magic_pdf/parse_by_ocr_v2.py
+41
-3
construct_page_dict.py
magic_pdf/pre_proc/construct_page_dict.py
+28
-11
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+9
-6
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+17
-12
ocr_fix_block_logic.py
magic_pdf/pre_proc/ocr_fix_block_logic.py
+18
-6
ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+25
-7
No files found.
magic_pdf/libs/ocr_content_type.py
View file @
91d296de
...
@@ -5,3 +5,16 @@ class ContentType:
...
@@ -5,3 +5,16 @@ class ContentType:
InlineEquation
=
"inline_equation"
InlineEquation
=
"inline_equation"
InterlineEquation
=
"interline_equation"
InterlineEquation
=
"interline_equation"
class BlockType:
    """String constants naming the kinds of layout blocks.

    The values are plain strings, so they compare equal to the raw string
    tags stored in block records.
    """

    # Image block and its nested parts.
    Image = "image"
    ImageBody = "image_body"
    ImageCaption = "image_caption"

    # Table block and its nested parts.
    Table = "table"
    TableBody = "table_body"
    TableCaption = "table_caption"
    TableFootnote = "table_footnote"

    # Text-like blocks.
    Text = "text"
    Title = "title"
    InterlineEquation = "interline_equation"

    # Page-level footnote.
    Footnote = "footnote"
magic_pdf/parse_by_ocr_v2.py
View file @
91d296de
import
time
from
loguru
import
logger
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.libs.commons
import
fitz
,
get_delta_time
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component_v2
from
magic_pdf.pre_proc.cut_image
import
ocr_cut_image_and_table
from
magic_pdf.pre_proc.cut_image
import
ocr_cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
def
parse_pdf_by_ocr
(
pdf_bytes
,
def
parse_pdf_by_ocr
(
pdf_bytes
,
...
@@ -15,17 +21,31 @@ def parse_pdf_by_ocr(pdf_bytes,
...
@@ -15,17 +21,31 @@ def parse_pdf_by_ocr(pdf_bytes,
end_page_id
=
None
,
end_page_id
=
None
,
debug_mode
=
False
,
debug_mode
=
False
,
):
):
pdf_bytes_md5
=
compute_md5
(
pdf_bytes
)
pdf_bytes_md5
=
compute_md5
(
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
"pdf"
,
pdf_bytes
)
'''初始化空的pdf_info_dict'''
pdf_info_dict
=
{}
'''用model_list和docs对象初始化magic_model'''
'''用model_list和docs对象初始化magic_model'''
magic_model
=
MagicModel
(
model_list
,
pdf_docs
)
magic_model
=
MagicModel
(
model_list
,
pdf_docs
)
'''根据输入的起始范围解析pdf'''
'''根据输入的起始范围解析pdf'''
end_page_id
=
end_page_id
if
end_page_id
else
len
(
pdf_docs
)
-
1
end_page_id
=
end_page_id
if
end_page_id
else
len
(
pdf_docs
)
-
1
'''初始化启动时间'''
start_time
=
time
.
time
()
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
'''debug时输出每页解析的耗时'''
if
debug_mode
:
time_now
=
time
.
time
()
logger
.
info
(
f
"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
)
start_time
=
time_now
'''从magic_model对象中获取后面会用到的区块信息'''
'''从magic_model对象中获取后面会用到的区块信息'''
img_blocks
=
magic_model
.
get_imgs
(
page_id
)
img_blocks
=
magic_model
.
get_imgs
(
page_id
)
table_blocks
=
magic_model
.
get_tables
(
page_id
)
table_blocks
=
magic_model
.
get_tables
(
page_id
)
...
@@ -61,3 +81,21 @@ def parse_pdf_by_ocr(pdf_bytes,
...
@@ -61,3 +81,21 @@ def parse_pdf_by_ocr(pdf_bytes,
'''对block进行fix操作'''
'''对block进行fix操作'''
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
'''获取QA需要外置的list'''
images
,
tables
,
interline_equations
=
get_qa_need_list_v2
(
fix_blocks
)
'''构造pdf_info_dict'''
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
discarded_blocks
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
"""分段"""
pass
"""dict转list"""
pdf_info_list
=
dict_to_list
(
pdf_info_dict
)
new_pdf_info_dict
=
{
"pdf_info"
:
pdf_info_list
,
}
return
new_pdf_info_dict
magic_pdf/pre_proc/construct_page_dict.py
View file @
91d296de
def
construct_page_component
(
page_id
,
image_info
,
table_info
,
text_blocks_preproc
,
layout_bboxes
,
inline_eq_info
,
def
construct_page_component
(
page_id
,
image_info
,
table_info
,
text_blocks_preproc
,
layout_bboxes
,
inline_eq_info
,
interline_eq_info
,
raw_pymu_blocks
,
interline_eq_info
,
raw_pymu_blocks
,
removed_text_blocks
,
removed_image_blocks
,
images_backup
,
droped_table_block
,
table_backup
,
layout_tree
,
removed_text_blocks
,
removed_image_blocks
,
images_backup
,
droped_table_block
,
table_backup
,
layout_tree
,
page_w
,
page_h
,
footnote_bboxes_tmp
):
page_w
,
page_h
,
footnote_bboxes_tmp
):
"""
"""
...
@@ -51,3 +52,19 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
...
@@ -51,3 +52,19 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
'droped_bboxes'
:
need_remove_spans_bboxes_dict
,
'droped_bboxes'
:
need_remove_spans_bboxes_dict
,
}
}
return
return_dict
return
return_dict
def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                    images, tables, interline_equations, droped_blocks):
    """Bundle the per-page parsing results into a single page-info dict.

    Every argument is stored as-is (no copying or validation) under a fixed
    key; ``page_w`` and ``page_h`` are combined into a ``[page_w, page_h]``
    list under ``'page_size'``.

    Returns:
        dict with the keys ``'preproc_blocks'``, ``'layout_bboxes'``,
        ``'page_idx'``, ``'page_size'``, ``'_layout_tree'``, ``'images'``,
        ``'tables'``, ``'interline_equations'`` and ``'droped_blocks'``.
    """
    return_dict = {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': [page_w, page_h],
        '_layout_tree': layout_tree,
        'images': images,
        'tables': tables,
        'interline_equations': interline_equations,
        # Key spelling ('droped') is kept as-is: it is part of the output schema.
        'droped_blocks': droped_blocks,
    }
    return return_dict
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
91d296de
from
magic_pdf.libs.ocr_content_type
import
BlockType
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Flatten all detected blocks of a page into one list of 12-slot bbox records.

    Each record has the form
    ``[x0, y0, x1, y1, None, None, None, block_type, None, None, None, None]``:
    the first four slots are the block's ``'bbox'`` and slot 7 is a
    ``BlockType`` tag. The remaining slots are filled with ``None`` here.

    Records are emitted in this order: images, tables, texts, titles,
    interline equations, then the discarded blocks that qualify as footnotes.

    Args:
        img_blocks, table_blocks, text_blocks, title_blocks,
        interline_equation_blocks: iterables of dicts, each with a ``'bbox'``
            entry that unpacks to ``(x0, y0, x1, y1)``.
        discarded_blocks: iterable of dicts with a ``'bbox'`` entry; only the
            ones passing the footnote filter below are kept.
        page_w: page width, used by the footnote width filter.
        page_h: page height, used by the footnote position filter.

    Returns:
        list of 12-element lists as described above.
    """
    all_bboxes = []

    # (blocks, tag) pairs, processed in this order for every kept block.
    typed_groups = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    for group, block_type in typed_groups:
        for block in group:
            x0, y0, x1, y1 = block['bbox']
            all_bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None])

    # Of the discarded blocks, keep only those wider than 1/3 of the page
    # width, taller than 10, and starting in the lower 50% of the page
    # (i.e. restrict them to footnotes).
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])

    return all_bboxes
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
91d296de
...
@@ -3,8 +3,8 @@ from loguru import logger
...
@@ -3,8 +3,8 @@ from loguru import logger
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
,
get_minbox_if_overlap_by_ratio
,
\
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
,
get_minbox_if_overlap_by_ratio
,
\
calculate_overlap_area_in_bbox1_area_ratio
calculate_overlap_area_in_bbox1_area_ratio
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.pre_proc.ocr_fix_block_logic
import
fix_image_block
,
fix_table_block
from
magic_pdf.pre_proc.ocr_fix_block_logic
import
fix_image_block
,
fix_table_block
,
fix_text_block
# 将每一个line中的span从左到右排序
# 将每一个line中的span从左到右排序
...
@@ -117,7 +117,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
...
@@ -117,7 +117,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
layout_blocks
=
[]
layout_blocks
=
[]
for
block
in
all_bboxes
:
for
block
in
all_bboxes
:
# 如果是footnote则跳过
# 如果是footnote则跳过
if
block
[
7
]
==
'footnote'
:
if
block
[
7
]
==
BlockType
.
Footnote
:
continue
continue
block_bbox
=
[
block
[
0
],
block
[
1
],
block
[
2
],
block
[
3
]]
block_bbox
=
[
block
[
0
],
block
[
1
],
block
[
2
],
block
[
3
]]
if
calculate_overlap_area_in_bbox1_area_ratio
(
block_bbox
,
layout_bbox
)
>
0.8
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
block_bbox
,
layout_bbox
)
>
0.8
:
...
@@ -141,6 +141,9 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
...
@@ -141,6 +141,9 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
def
fill_spans_in_blocks
(
blocks
,
spans
):
def
fill_spans_in_blocks
(
blocks
,
spans
):
'''
将allspans中的span按位置关系,放入blocks中
'''
block_with_spans
=
[]
block_with_spans
=
[]
for
block
in
blocks
:
for
block
in
blocks
:
block_type
=
block
[
7
]
block_type
=
block
[
7
]
...
@@ -166,20 +169,22 @@ def fill_spans_in_blocks(blocks, spans):
...
@@ -166,20 +169,22 @@ def fill_spans_in_blocks(blocks, spans):
def
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
):
def
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
):
'''
1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
需要将caption和footnote的text_span放入相应img_block和table_block内的
caption_block和footnote_block中
2、同时需要删除block中的spans字段
'''
fix_blocks
=
[]
fix_blocks
=
[]
for
block
in
block_with_spans
:
for
block
in
block_with_spans
:
block_type
=
block
[
'block_type'
]
block_type
=
block
[
'block_type'
]
# 只有type为image_block和table_block才需要处理
if
block_type
==
'image_block'
:
if
block_type
==
BlockType
.
Image
:
block
=
fix_image_block
(
block
,
img_blocks
)
block
=
fix_image_block
(
block
,
img_blocks
)
elif
block_type
==
'table_block'
:
elif
block_type
==
BlockType
.
Table
:
block
=
fix_table_block
(
block
,
table_blocks
)
block
=
fix_table_block
(
block
,
table_blocks
)
elif
block_type
==
'text_block'
:
elif
block_type
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
InterlineEquation
]:
pass
block
=
fix_text_block
(
block
)
elif
block_type
==
'title_block'
:
pass
elif
block_type
==
'interline_equation_block'
:
pass
else
:
else
:
continue
continue
fix_blocks
.
append
(
block
)
fix_blocks
.
append
(
block
)
...
...
magic_pdf/pre_proc/ocr_fix_block_logic.py
View file @
91d296de
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
,
line_sort_spans_by_left_to_right
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
,
line_sort_spans_by_left_to_right
def
merge_spans_to_block
(
spans
:
list
,
block_bbox
:
list
,
block_type
:
str
):
def
merge_spans_to_block
(
spans
:
list
,
block_bbox
:
list
,
block_type
:
str
):
block_spans
=
[]
block_spans
=
[]
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
...
@@ -18,6 +19,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
...
@@ -18,6 +19,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
}
}
return
block
,
block_spans
return
block
,
block_spans
def
make_body_block
(
span
:
dict
,
block_bbox
:
list
,
block_type
:
str
):
def
make_body_block
(
span
:
dict
,
block_bbox
:
list
,
block_type
:
str
):
# 创建body_block
# 创建body_block
body_line
=
{
body_line
=
{
...
@@ -41,7 +43,7 @@ def fix_image_block(block, img_blocks):
...
@@ -41,7 +43,7 @@ def fix_image_block(block, img_blocks):
for
span
in
block
[
'spans'
]:
for
span
in
block
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
Image
and
span
[
'bbox'
]
==
img_block
[
'img_body_bbox'
]:
if
span
[
'type'
]
==
ContentType
.
Image
and
span
[
'bbox'
]
==
img_block
[
'img_body_bbox'
]:
# 创建img_body_block
# 创建img_body_block
img_body_block
=
make_body_block
(
span
,
img_block
[
'img_body_bbox'
],
'img_body_block'
)
img_body_block
=
make_body_block
(
span
,
img_block
[
'img_body_bbox'
],
BlockType
.
ImageBody
)
block
[
'blocks'
]
.
append
(
img_body_block
)
block
[
'blocks'
]
.
append
(
img_body_block
)
# 从spans中移除img_body_block中已经放入的span
# 从spans中移除img_body_block中已经放入的span
...
@@ -51,7 +53,7 @@ def fix_image_block(block, img_blocks):
...
@@ -51,7 +53,7 @@ def fix_image_block(block, img_blocks):
# 根据list长度,判断img_block中是否有img_caption
# 根据list长度,判断img_block中是否有img_caption
if
len
(
img_block
[
'img_caption_bbox'
])
>
0
:
if
len
(
img_block
[
'img_caption_bbox'
])
>
0
:
img_caption_block
,
img_caption_spans
=
merge_spans_to_block
(
img_caption_block
,
img_caption_spans
=
merge_spans_to_block
(
block
[
'spans'
],
img_block
[
'img_caption_bbox'
],
'img_caption_block'
block
[
'spans'
],
img_block
[
'img_caption_bbox'
],
BlockType
.
ImageCaption
)
)
block
[
'blocks'
]
.
append
(
img_caption_block
)
block
[
'blocks'
]
.
append
(
img_caption_block
)
...
@@ -69,7 +71,7 @@ def fix_table_block(block, table_blocks):
...
@@ -69,7 +71,7 @@ def fix_table_block(block, table_blocks):
for
span
in
block
[
'spans'
]:
for
span
in
block
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
Table
and
span
[
'bbox'
]
==
table_block
[
'table_body_bbox'
]:
if
span
[
'type'
]
==
ContentType
.
Table
and
span
[
'bbox'
]
==
table_block
[
'table_body_bbox'
]:
# 创建table_body_block
# 创建table_body_block
table_body_block
=
make_body_block
(
span
,
table_block
[
'table_body_bbox'
],
'table_body_block'
)
table_body_block
=
make_body_block
(
span
,
table_block
[
'table_body_bbox'
],
BlockType
.
TableBody
)
block
[
'blocks'
]
.
append
(
table_body_block
)
block
[
'blocks'
]
.
append
(
table_body_block
)
# 从spans中移除img_body_block中已经放入的span
# 从spans中移除img_body_block中已经放入的span
...
@@ -79,7 +81,7 @@ def fix_table_block(block, table_blocks):
...
@@ -79,7 +81,7 @@ def fix_table_block(block, table_blocks):
# 根据list长度,判断table_block中是否有caption
# 根据list长度,判断table_block中是否有caption
if
len
(
table_block
[
'table_caption_bbox'
])
>
0
:
if
len
(
table_block
[
'table_caption_bbox'
])
>
0
:
table_caption_block
,
table_caption_spans
=
merge_spans_to_block
(
table_caption_block
,
table_caption_spans
=
merge_spans_to_block
(
block
[
'spans'
],
table_block
[
'table_caption_bbox'
],
'table_caption_block'
block
[
'spans'
],
table_block
[
'table_caption_bbox'
],
BlockType
.
TableCaption
)
)
block
[
'blocks'
]
.
append
(
table_caption_block
)
block
[
'blocks'
]
.
append
(
table_caption_block
)
...
@@ -92,10 +94,20 @@ def fix_table_block(block, table_blocks):
...
@@ -92,10 +94,20 @@ def fix_table_block(block, table_blocks):
# 根据list长度,判断table_block中是否有table_note
# 根据list长度,判断table_block中是否有table_note
if
len
(
table_block
[
'table_footnote_bbox'
])
>
0
:
if
len
(
table_block
[
'table_footnote_bbox'
])
>
0
:
table_footnote_block
,
table_footnote_spans
=
merge_spans_to_block
(
table_footnote_block
,
table_footnote_spans
=
merge_spans_to_block
(
block
[
'spans'
],
table_block
[
'table_footnote_bbox'
],
'table_footnote_block'
block
[
'spans'
],
table_block
[
'table_footnote_bbox'
],
BlockType
.
TableFootnote
)
)
block
[
'blocks'
]
.
append
(
table_footnote_block
)
block
[
'blocks'
]
.
append
(
table_footnote_block
)
break
break
del
block
[
'spans'
]
del
block
[
'spans'
]
return
block
return
block
def fix_text_block(block):
    """Replace a block's flat ``'spans'`` list with sorted ``'lines'``.

    The spans are grouped by ``merge_spans_to_line``, the result is passed
    through ``line_sort_spans_by_left_to_right`` and stored under
    ``block['lines']``; the ``'spans'`` key is then removed.

    The block is modified in place and also returned.

    Raises:
        KeyError: if ``block`` has no ``'spans'`` entry.
    """
    # NOTE(review): both helpers come from magic_pdf.pre_proc.ocr_dict_merge,
    # which in turn imports this function at module level -- looks like a
    # circular import; confirm the modules can actually be imported.
    block_lines = merge_spans_to_line(block['spans'])
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
    block['lines'] = sort_block_lines
    del block['spans']
    return block
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
91d296de
...
@@ -3,7 +3,7 @@ from loguru import logger
...
@@ -3,7 +3,7 @@ from loguru import logger
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
,
\
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
,
\
__is_overlaps_y_exceeds_threshold
__is_overlaps_y_exceeds_threshold
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
def
remove_overlaps_min_spans
(
spans
):
def
remove_overlaps_min_spans
(
spans
):
...
@@ -50,7 +50,8 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
...
@@ -50,7 +50,8 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
need_remove_spans
.
append
(
span
)
need_remove_spans
.
append
(
span
)
break
break
# 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
# 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
elif
drop_tag
==
DropTag
.
FOOTNOTE
and
(
span
[
'bbox'
][
1
]
+
span
[
'bbox'
][
3
])
/
2
>
removed_bbox
[
3
]
and
removed_bbox
[
0
]
<
(
span
[
'bbox'
][
0
]
+
span
[
'bbox'
][
2
])
/
2
<
removed_bbox
[
2
]:
elif
drop_tag
==
DropTag
.
FOOTNOTE
and
(
span
[
'bbox'
][
1
]
+
span
[
'bbox'
][
3
])
/
2
>
removed_bbox
[
3
]
and
\
removed_bbox
[
0
]
<
(
span
[
'bbox'
][
0
]
+
span
[
'bbox'
][
2
])
/
2
<
removed_bbox
[
2
]:
need_remove_spans
.
append
(
span
)
need_remove_spans
.
append
(
span
)
break
break
...
@@ -162,9 +163,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
...
@@ -162,9 +163,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
text_line
=
text_inline_lines
[
j
]
text_line
=
text_inline_lines
[
j
]
y0
,
y1
=
text_line
[
1
]
y0
,
y1
=
text_line
[
1
]
if
(
if
(
span_y0
<
y0
and
span_y
>
y0
or
span_y0
<
y1
and
span_y
>
y1
or
span_y0
<
y0
and
span_y
>
y1
)
and
__is_overlaps_y_exceeds_threshold
(
span_y0
<
y0
<
span_y
or
span_y0
<
y1
<
span_y
or
span_y0
<
y0
and
span_y
>
y1
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
)
and
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)
):
# 调整公式类型
# 调整公式类型
if
span
[
"type"
]
==
ContentType
.
InterlineEquation
:
if
span
[
"type"
]
==
ContentType
.
InterlineEquation
:
# 最后一行是行间公式
# 最后一行是行间公式
...
@@ -181,7 +183,7 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
...
@@ -181,7 +183,7 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
3
]
=
y1
span
[
"bbox"
][
3
]
=
y1
break
break
elif
span_y
<
y0
or
span_y0
<
y0
and
span_y
>
y0
and
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
elif
span_y
<
y0
or
span_y0
<
y0
<
span_y
and
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
(
0
,
y0
,
0
,
y1
)):
break
break
else
:
else
:
...
@@ -211,3 +213,19 @@ def get_qa_need_list(blocks):
...
@@ -211,3 +213,19 @@ def get_qa_need_list(blocks):
else
:
else
:
continue
continue
return
images
,
tables
,
interline_equations
,
inline_equations
return
images
,
tables
,
interline_equations
,
inline_equations
def get_qa_need_list_v2(blocks):
    """Split top-level blocks into image, table and interline-equation lists.

    Each block whose ``"type"`` equals ``BlockType.Image``, ``BlockType.Table``
    or ``BlockType.InterlineEquation`` is appended (the same object, not a
    copy) to the matching list; blocks of any other type are ignored.

    Returns:
        tuple ``(images, tables, interline_equations)`` of lists, each
        preserving the input order.

    Raises:
        KeyError: if a block has no ``"type"`` entry.
    """
    # NOTE(review): fix_block_spans reads block['block_type'], while this
    # function reads block["type"] -- confirm which key the blocks passed in
    # here actually carry.
    images = []
    tables = []
    interline_equations = []

    for block in blocks:
        if block["type"] == BlockType.Image:
            images.append(block)
        elif block["type"] == BlockType.Table:
            tables.append(block)
        elif block["type"] == BlockType.InterlineEquation:
            interline_equations.append(block)
    return images, tables, interline_equations
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment