Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
7e8e9cab
Commit
7e8e9cab
authored
Apr 18, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
重构parse_by_ocr_v2
parent
7b0db8a4
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
346 additions
and
21 deletions
+346
-21
magicpdf.py
magic_pdf/cli/magicpdf.py
+1
-1
magic_model.py
magic_pdf/model/magic_model.py
+60
-5
parse_by_ocr_v2.py
magic_pdf/parse_by_ocr_v2.py
+63
-0
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+0
-3
UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+9
-9
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+32
-0
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+80
-3
ocr_fix_block_logic.py
magic_pdf/pre_proc/ocr_fix_block_logic.py
+101
-0
No files found.
magic_pdf/cli/magicpdf.py
View file @
7e8e9cab
...
...
@@ -143,7 +143,7 @@ def pdf_command(pdf, model, method):
model
=
pdf
.
replace
(
".pdf"
,
".json"
)
if
not
os
.
path
.
exists
(
model
):
print
(
f
"make sure json file existed and place under {os.dirname(pdf)}"
)
os
.
e
ix
t
(
1
)
os
.
e
xi
t
(
1
)
def
read_fn
(
path
):
disk_rw
=
DiskReaderWriter
(
os
.
path
.
dirname
(
path
))
...
...
magic_pdf/model/magic_model.py
View file @
7e8e9cab
...
...
@@ -5,6 +5,7 @@ from loguru import logger
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
...
...
@@ -16,6 +17,7 @@ class MagicModel():
"""
def
__fix_axis
(
self
):
need_remove_list
=
[]
for
model_page_info
in
self
.
__model_list
:
page_no
=
model_page_info
[
'page_info'
][
'page_no'
]
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
model_page_info
,
self
.
__docs
[
page_no
])
...
...
@@ -29,6 +31,12 @@ class MagicModel():
int
(
y1
/
vertical_scale_ratio
),
]
layout_det
[
"bbox"
]
=
bbox
# 删除高度或者宽度为0的spans
if
bbox
[
2
]
-
bbox
[
0
]
==
0
or
bbox
[
3
]
-
bbox
[
1
]
==
0
:
need_remove_list
.
append
(
layout_det
)
for
need_remove
in
need_remove_list
:
layout_dets
.
remove
(
need_remove
)
def
__init__
(
self
,
model_list
:
list
,
docs
:
fitz
.
Document
):
self
.
__model_list
=
model_list
...
...
@@ -43,7 +51,6 @@ class MagicModel():
image_block
[
'bbox'
]
=
[
x0
,
y0
,
x1
,
y1
]
# 计算出来
image_block
[
'img_body_bbox'
]
=
[
x0
,
y0
,
x1
,
y1
]
image_blcok
[
'img_caption_bbox'
]
=
[
x0
,
y0
,
x1
,
y1
]
# 如果没有就是None,但是保证key存在
image_blcok
[
'img_caption_text'
]
=
[
x0
,
y0
,
x1
,
y1
]
# 如果没有就是空字符串,但是保证key存在
return
[
image_block
,
]
...
...
@@ -63,10 +70,58 @@ class MagicModel():
pass
# @凯文
def get_ocr_text(self, page_no: int) -> list:
    """Return the OCR text detections of one page as ``bbox``/``content`` spans.

    Args:
        page_no: index into the model list held by this object.

    Returns:
        A list of ``{"bbox": ..., "content": ...}`` dicts, one per layout
        detection whose ``category_id`` denotes OCR text (15), in the order
        the detections appear in ``layout_dets``.
    """
    layout_dets = self.__model_list[page_no]["layout_dets"]
    return [
        {
            "bbox": layout_det["bbox"],
            "content": layout_det["text"],
        }
        for layout_det in layout_dets
        # Compare the textual form so that both 15 and "15" are accepted;
        # get_all_spans in this class matches category ids as integers.
        if str(layout_det["category_id"]) == "15"
    ]
def get_ocr_spans(self, page_no: int) -> list:
    """Placeholder: not implemented yet.

    The body is a bare ``pass``, so despite the ``-> list`` annotation the
    method currently returns ``None`` for every ``page_no``.
    """
    pass
# @小蒙
def get_all_spans(self, page_no: int) -> list:
    """Build the span list of one page from its layout detections.

    Only detections with these integer category ids become spans:
        3  -> ContentType.Image              (no content)
        5  -> ContentType.Table              (no content)
        13 -> ContentType.InlineEquation     (content = det["latex"])
        14 -> ContentType.InterlineEquation  (content = det["latex"])
        15 -> ContentType.Text               (content = det["text"])

    Args:
        page_no: index into the model list held by this object.

    Returns:
        A list of span dicts, each with ``bbox`` and ``type`` and, for
        equations and text, ``content``; detection order is preserved.
    """
    span_category_ids = (3, 5, 13, 14, 15)
    collected = []
    for det in self.__model_list[page_no]["layout_dets"]:
        cid = det["category_id"]
        if cid not in span_category_ids:
            continue
        entry = {"bbox": det['bbox']}
        if cid == 3:
            entry["type"] = ContentType.Image
        elif cid == 5:
            entry["type"] = ContentType.Table
        elif cid == 15:
            entry["content"] = det["text"]
            entry["type"] = ContentType.Text
        else:
            # Remaining ids are 13 / 14: both carry their formula in "latex".
            entry["content"] = det["latex"]
            if cid == 13:
                entry["type"] = ContentType.InlineEquation
            else:
                entry["type"] = ContentType.InterlineEquation
        collected.append(entry)
    return collected
def get_page_size(self, page_no: int):
    """Return ``(width, height)`` of page ``page_no``.

    Both values are read from the ``rect`` attribute of the page object
    stored in this model's document collection.
    """
    current_page = self.__docs[page_no]
    return current_page.rect.width, current_page.rect.height
if
__name__
==
'__main__'
:
...
...
magic_pdf/parse_by_ocr_v2.py
0 → 100644
View file @
7e8e9cab
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.pre_proc.cut_image
import
ocr_cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
def parse_pdf_by_ocr(
        pdf_bytes,
        model_list,
        imageWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
):
    """Run the block-based OCR parsing steps over a range of PDF pages.

    For every page in the range the function gathers the model's blocks,
    derives a layout from their bounding boxes, orders the blocks by that
    layout, collects and de-duplicates spans, cuts image/table crops, fills
    the spans into the ordered blocks and finally applies the block fixes.

    Args:
        pdf_bytes: raw PDF content; hashed with ``compute_md5`` and opened
            with ``fitz``.
        model_list: model output handed to ``MagicModel``.
        imageWriter: writer passed through to ``ocr_cut_image_and_table``.
        start_page_id: index of the first page to process (inclusive).
        end_page_id: index of the last page to process (inclusive).
            ``None`` means "up to the last page of the document".
        debug_mode: accepted but not used by this function.

    Returns:
        None. NOTE(review): the per-page result (``fix_blocks``) is computed
        but not collected or returned yet - this looks like work in progress.
    """
    pdf_bytes_md5 = compute_md5(pdf_bytes)
    pdf_docs = fitz.open("pdf", pdf_bytes)

    # Initialise the model wrapper from model_list and the opened document.
    magic_model = MagicModel(model_list, pdf_docs)

    # Resolve the page range. Test against None explicitly: a plain
    # truthiness check would treat end_page_id=0 ("first page only") as
    # "not given" and silently process the whole document.
    end_page_id = end_page_id if end_page_id is not None else len(pdf_docs) - 1

    for page_id in range(start_page_id, end_page_id + 1):
        # Fetch the block information used by the later steps.
        img_blocks = magic_model.get_imgs(page_id)
        table_blocks = magic_model.get_tables(page_id)
        discarded_blocks = magic_model.get_discarded(page_id)
        text_blocks = magic_model.get_text_blocks(page_id)
        title_blocks = magic_model.get_title_blocks(page_id)
        inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)

        page_w, page_h = magic_model.get_page_size(page_id)

        # Gather the bounding boxes of all blocks into one list.
        all_bboxes = ocr_prepare_bboxes_for_layout_split(
            img_blocks, table_blocks, discarded_blocks, text_blocks,
            title_blocks, interline_equation_blocks, page_w, page_h)

        # Compute the page layout from the block boxes.
        page_boundry = [0, 0, page_w, page_h]
        layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)

        # Order every block that is kept on this page by the layout.
        sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)

        # Collect every span that has to be assembled into blocks.
        spans = magic_model.get_all_spans(page_id)

        # Among overlapping spans, drop the smaller ones.
        spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)

        # Cut out the image and table crops.
        spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)

        # Put the spans into the ordered blocks.
        block_with_spans = fill_spans_in_blocks(sorted_blocks, spans)

        # Apply the per-type block fixes.
        fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
magic_pdf/pdf_parse_by_ocr.py
View file @
7e8e9cab
...
...
@@ -160,9 +160,6 @@ def parse_pdf_by_ocr(
'''bbox去除粘连'''
spans
=
remove_overlap_between_bbox
(
spans
)
'''用现有的bbox计算layout'''
'''
对tpye=["interline_equation", "image", "table"]进行额外处理,
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
...
...
magic_pdf/pipe/UNIPipe.py
View file @
7e8e9cab
import
json
from
loguru
import
logger
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
,
mk_mm_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
make_standard_format_with_para
,
ocr_mk_mm_markdown_with_para
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.user_api
import
parse_union_pdf
,
parse_ocr_pdf
class
UNIPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
,
is_debug
:
bool
=
False
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
img_parent_path
:
str
,
is_debug
:
bool
=
False
):
self
.
pdf_type
=
self
.
PIP_OCR
super
()
.
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
img_parent_path
,
is_debug
)
...
...
@@ -24,9 +20,11 @@ class UNIPipe(AbsPipe):
def
pipe_parse
(
self
):
if
self
.
pdf_type
==
self
.
PIP_TXT
:
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
)
def
pipe_mk_uni_format
(
self
):
content_list
=
AbsPipe
.
mk_uni_format
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
...
...
@@ -36,6 +34,7 @@ class UNIPipe(AbsPipe):
markdown_content
=
AbsPipe
.
mk_markdown
(
self
.
get_compress_pdf_mid_data
(),
self
.
img_parent_path
)
return
markdown_content
if
__name__
==
'__main__'
:
# 测试
drw
=
DiskReaderWriter
(
r"D:/project/20231108code-clean"
)
...
...
@@ -60,5 +59,6 @@ if __name__ == '__main__':
md_writer
=
DiskReaderWriter
(
write_path
)
md_writer
.
write
(
md_content
,
"19983-00.md"
,
AbsReaderWriter
.
MODE_TXT
)
md_writer
.
write
(
json
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
"19983-00.json"
,
AbsReaderWriter
.
MODE_TXT
)
md_writer
.
write
(
json
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
"19983-00.json"
,
AbsReaderWriter
.
MODE_TXT
)
md_writer
.
write
(
str
(
content_list
),
"19983-00.txt"
,
AbsReaderWriter
.
MODE_TXT
)
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
0 → 100644
View file @
7e8e9cab
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Flatten all typed blocks of a page into rows for layout splitting.

    Each row is a 12-element list
    ``[x0, y0, x1, y1, None, None, None, block_type, None, None, None, None]``.

    Args:
        img_blocks, table_blocks, text_blocks, title_blocks,
        interline_equation_blocks: iterables of dicts with a 4-value
            ``'bbox'``; every entry is emitted, tagged ``'image_block'``,
            ``'table_block'``, ``'text_block'``, ``'title_block'`` and
            ``'interline_equation_block'`` respectively.
        discarded_blocks: iterable of dicts with a 4-value ``'bbox'``; only
            entries that look like footnotes are emitted (see below).
        page_w: page width, used for the footnote width threshold.
        page_h: page height, used for the footnote position threshold.

    Returns:
        The list of rows, ordered image, table, text, title, interline
        equation, then the retained discarded blocks tagged ``'footnote'``.
    """

    def _row(bbox, block_type):
        # One layout row: the bbox, the type tag at index 7, None elsewhere.
        x0, y0, x1, y1 = bbox
        return [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None]

    typed_groups = (
        (img_blocks, 'image_block'),
        (table_blocks, 'table_block'),
        (text_blocks, 'text_block'),
        (title_blocks, 'title_block'),
        (interline_equation_blocks, 'interline_equation_block'),
    )
    all_bboxes = [
        _row(block['bbox'], block_type)
        for blocks, block_type in typed_groups
        for block in blocks
    ]

    # Of the discarded blocks keep only those wider than 1/3 of the page,
    # taller than 10 and starting in the lower half of the page; these are
    # treated as footnotes.
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            all_bboxes.append(_row(discarded['bbox'], 'footnote'))

    return all_bboxes
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
7e8e9cab
...
...
@@ -4,6 +4,7 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
calculate_overlap_area_in_bbox1_area_ratio
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.pre_proc.ocr_fix_block_logic
import
fix_image_block
,
fix_table_block
# 将每一个line中的span从左到右排序
...
...
@@ -24,6 +25,7 @@ def line_sort_spans_by_left_to_right(lines):
})
return
line_objects
def
merge_spans_to_line
(
spans
):
if
len
(
spans
)
==
0
:
return
[]
...
...
@@ -37,7 +39,8 @@ def merge_spans_to_line(spans):
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
or
any
(
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
# 则开始新行
lines
.
append
(
current_line
)
current_line
=
[
span
]
...
...
@@ -57,6 +60,7 @@ def merge_spans_to_line(spans):
return
lines
def
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
):
lines
=
[]
new_spans
=
[]
...
...
@@ -103,7 +107,80 @@ def merge_lines_to_block(lines):
return
blocks
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    """Order block rows by layout region, then top-to-bottom inside a region.

    Args:
        all_bboxes: list of block rows (bbox in indices 0-3, type tag at
            index 7). NOTE: this list is modified in place - every row that
            gets assigned to a layout is removed from it.
        layout_bboxes: iterable of dicts carrying a ``'layout_bbox'``.

    Returns:
        The assigned rows: layouts in the order given, rows inside one layout
        sorted by ``y0`` (index 1). Rows tagged ``'footnote'`` and rows that
        match no layout are not returned.
    """
    groups = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        # A row belongs to this layout when more than 80% of its own area
        # overlaps the layout box; footnotes are never assigned.
        members = [
            row for row in all_bboxes
            if row[7] != 'footnote'
            and calculate_overlap_area_in_bbox1_area_ratio(
                [row[0], row[1], row[2], row[3]], region) > 0.8
        ]
        if not members:
            continue
        groups.append(members)
        # Take the assigned rows out of the pool so that a later layout
        # cannot claim them a second time.
        for row in members:
            all_bboxes.remove(row)

    ordered = []
    for members in groups:
        members.sort(key=lambda row: row[1])  # top-to-bottom by y0
        ordered.extend(members)
    return ordered
def fill_spans_in_blocks(blocks, spans):
    """Attach to every block the spans that lie inside it.

    Args:
        blocks: ordered block rows (bbox in indices 0-3, type tag at index 7).
        spans: list of span dicts with a ``'bbox'``. NOTE: modified in place -
            a span that has been attached to a block is removed from it, so
            an earlier block wins over a later one.

    Returns:
        One dict per block, in input order, with keys ``'block_type'``,
        ``'bbox'`` and ``'spans'`` (possibly an empty list).
    """
    filled = []
    for row in blocks:
        kind = row[7]
        region = row[0:4]
        # A span is inside the block when more than 80% of the span's own
        # area overlaps the block box.
        inside = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], region) > 0.8
        ]
        filled.append({
            'block_type': kind,
            'bbox': region,
            'spans': inside,
        })
        # Remove the spans that were just consumed from the shared pool.
        for used in inside:
            spans.remove(used)
    return filled
def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    """Post-process the span-filled blocks according to their type.

    ``'image_block'`` entries go through ``fix_image_block`` and
    ``'table_block'`` entries through ``fix_table_block``; text, title and
    interline-equation blocks are passed on unchanged; any other block type
    is dropped.

    Args:
        block_with_spans: iterable of block dicts with a ``'block_type'``.
        img_blocks: image blocks forwarded to ``fix_image_block``.
        table_blocks: table blocks forwarded to ``fix_table_block``.

    Returns:
        The list of kept (and, where applicable, fixed) blocks, input order
        preserved.
    """
    untouched_types = ('text_block', 'title_block', 'interline_equation_block')
    kept = []
    for entry in block_with_spans:
        kind = entry['block_type']
        if kind == 'image_block':
            entry = fix_image_block(entry, img_blocks)
        elif kind == 'table_block':
            entry = fix_table_block(entry, table_blocks)
        elif kind not in untouched_types:
            # Unknown block types are not carried over.
            continue
        kept.append(entry)
    return kept
magic_pdf/pre_proc/ocr_fix_block_logic.py
0 → 100644
View file @
7e8e9cab
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
,
line_sort_spans_by_left_to_right
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    """Build a sub-block from the spans that fall inside ``block_bbox``.

    Args:
        spans: candidate span dicts, each with a ``'bbox'``; not modified.
        block_bbox: bounding box of the sub-block being built.
        block_type: type tag stored in the resulting block.

    Returns:
        A tuple ``(block, block_spans)``: ``block`` is a dict with ``'bbox'``,
        ``'block_type'`` and ``'lines'`` (the selected spans merged into lines
        by ``merge_spans_to_line`` and then passed through
        ``line_sort_spans_by_left_to_right``); ``block_spans`` is the list of
        spans that were selected.
    """
    # Select the spans whose own area overlaps the box by more than 80%.
    selected = [
        candidate for candidate in spans
        if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], block_bbox) > 0.8
    ]
    # Group into lines, then order the spans of each line left to right.
    ordered_lines = line_sort_spans_by_left_to_right(merge_spans_to_line(selected))
    merged = {
        'bbox': block_bbox,
        'block_type': block_type,
        'lines': ordered_lines,
    }
    return merged, selected
def make_body_block(span: dict, block_bbox: list, block_type: str):
    """Wrap a single span into a one-line body block.

    Args:
        span: the span that forms the body.
        block_bbox: bounding box used for both the block and its only line.
        block_type: type tag stored in the block.

    Returns:
        ``{'bbox': block_bbox, 'block_type': block_type, 'lines': [line]}``
        where ``line`` is ``{'bbox': block_bbox, 'spans': [span]}``.
    """
    return {
        'bbox': block_bbox,
        'block_type': block_type,
        'lines': [
            {
                'bbox': block_bbox,
                'spans': [span],
            },
        ],
    }
def fix_image_block(block, img_blocks):
    """Split an image block into body and caption sub-blocks.

    The block is matched against ``img_blocks`` by exact ``'bbox'`` equality;
    only the first match is used. For that match, the first span of type
    ``ContentType.Image`` whose bbox equals ``'img_body_bbox'`` becomes an
    ``'img_body_block'``, and - if a caption box is present - the remaining
    spans inside ``'img_caption_bbox'`` become an ``'img_caption_block'``.

    Args:
        block: block dict with ``'bbox'`` and ``'spans'``. Modified in place:
            ``'blocks'`` is added and ``'spans'`` is deleted.
        img_blocks: image block dicts with ``'bbox'``, ``'img_body_bbox'``
            and ``'img_caption_bbox'``.

    Returns:
        The same ``block`` object; ``block['blocks']`` stays empty when no
        entry of ``img_blocks`` matches.
    """
    block['blocks'] = []
    # Find the image block that corresponds to the current block.
    for img_block in img_blocks:
        if img_block['bbox'] != block['bbox']:
            continue

        # Create the body sub-block from the matching image span.
        for span in block['spans']:
            if span['type'] == ContentType.Image and span['bbox'] == img_block['img_body_bbox']:
                img_body_block = make_body_block(span, img_block['img_body_bbox'], 'img_body_block')
                block['blocks'].append(img_body_block)
                # The span is consumed; removing during iteration is safe
                # only because the loop is left immediately afterwards.
                block['spans'].remove(span)
                break

        # A caption exists only when its bbox is non-empty. Use truthiness
        # instead of len(): a missing caption may be stored as None, on
        # which len() would raise TypeError.
        if img_block['img_caption_bbox']:
            img_caption_block, img_caption_spans = merge_spans_to_block(
                block['spans'], img_block['img_caption_bbox'], 'img_caption_block'
            )
            block['blocks'].append(img_caption_block)

        break

    del block['spans']
    return block
def fix_table_block(block, table_blocks):
    """Split a table block into body, caption and footnote sub-blocks.

    The block is matched against ``table_blocks`` by exact ``'bbox'``
    equality; only the first match is used. For that match, the first span
    of type ``ContentType.Table`` whose bbox equals ``'table_body_bbox'``
    becomes a ``'table_body_block'``; spans inside ``'table_caption_bbox'``
    become a ``'table_caption_block'`` and are taken out of the pool; spans
    inside ``'table_footnote_bbox'`` become a ``'table_footnote_block'``.

    Args:
        block: block dict with ``'bbox'`` and ``'spans'``. Modified in place:
            ``'blocks'`` is added and ``'spans'`` is deleted.
        table_blocks: table block dicts with ``'bbox'``,
            ``'table_body_bbox'``, ``'table_caption_bbox'`` and
            ``'table_footnote_bbox'``.

    Returns:
        The same ``block`` object; ``block['blocks']`` stays empty when no
        entry of ``table_blocks`` matches.
    """
    block['blocks'] = []
    # Find the table block that corresponds to the current block.
    for table_block in table_blocks:
        if table_block['bbox'] != block['bbox']:
            continue

        # Create the body sub-block from the matching table span.
        for span in block['spans']:
            if span['type'] == ContentType.Table and span['bbox'] == table_block['table_body_bbox']:
                table_body_block = make_body_block(span, table_block['table_body_bbox'], 'table_body_block')
                block['blocks'].append(table_body_block)
                # The span is consumed; removing during iteration is safe
                # only because the loop is left immediately afterwards.
                block['spans'].remove(span)
                break

        # Caption: present only when its bbox is non-empty. Truthiness is
        # used instead of len() so that a caption stored as None
        # (presumably the "absent" value - confirm against the producer)
        # does not raise TypeError.
        if table_block['table_caption_bbox']:
            table_caption_block, table_caption_spans = merge_spans_to_block(
                block['spans'], table_block['table_caption_bbox'], 'table_caption_block'
            )
            block['blocks'].append(table_caption_block)

            # Spans placed in the caption must not be offered to the
            # footnote step below.
            for span in table_caption_spans:
                block['spans'].remove(span)

        # Footnote: same presence rule as the caption.
        if table_block['table_footnote_bbox']:
            table_footnote_block, table_footnote_spans = merge_spans_to_block(
                block['spans'], table_block['table_footnote_bbox'], 'table_footnote_block'
            )
            block['blocks'].append(table_footnote_block)

        break

    del block['spans']
    return block
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment