Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
25a6d4ba
Unverified
Commit
25a6d4ba
authored
Oct 25, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 25, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #785 from myhloli/fix-imgs-block
refactor(parse_core): improve image and table block handling
parents
c3cdf6f8
c34c9d21
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
167 additions
and
63 deletions
+167
-63
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+10
-10
pdf_extract_kit.py
magic_pdf/model/pdf_extract_kit.py
+1
-1
pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+99
-28
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+31
-24
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+26
-0
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
25a6d4ba
...
@@ -70,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -70,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text
+=
f
"
\n
})
\n
"
para_text
+=
f
"
\n
})
\n
"
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
if
mode
==
'nlp'
:
if
mode
==
'nlp'
:
continue
continue
elif
mode
==
'mm'
:
elif
mode
==
'mm'
:
for
block
in
para_block
[
'blocks'
]:
# 1st.拼table_caption
for
block
in
para_block
[
'blocks'
]:
# 1st.拼table_caption
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼table_body
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼table_body
if
block
[
'type'
]
==
BlockType
.
TableBody
:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
...
@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text
+=
f
"
\n
})
\n
"
para_text
+=
f
"
\n
})
\n
"
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
if
para_text
.
strip
()
==
''
:
if
para_text
.
strip
()
==
''
:
continue
continue
...
@@ -180,18 +180,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
...
@@ -180,18 +180,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
'text_format'
:
'latex'
,
'text_format'
:
'latex'
,
}
}
elif
para_type
==
BlockType
.
Image
:
elif
para_type
==
BlockType
.
Image
:
para_content
=
{
'type'
:
'image'
}
para_content
=
{
'type'
:
'image'
,
'img_caption'
:
[],
'img_footnote'
:
[]
}
for
block
in
para_block
[
'blocks'
]:
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
para_content
[
'img_path'
]
=
join_path
(
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
img_buket_path
,
block
[
'lines'
][
0
][
'spans'
][
0
][
'image_path'
])
block
[
'lines'
][
0
][
'spans'
][
0
][
'image_path'
])
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_content
[
'img_caption'
]
=
merge_para_with_text
(
block
)
para_content
[
'img_caption'
]
.
append
(
merge_para_with_text
(
block
)
)
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
para_content
[
'img_footnote'
]
=
merge_para_with_text
(
block
)
para_content
[
'img_footnote'
]
.
append
(
merge_para_with_text
(
block
)
)
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
para_content
=
{
'type'
:
'table'
}
para_content
=
{
'type'
:
'table'
,
'table_caption'
:
[],
'table_footnote'
:
[]
}
for
block
in
para_block
[
'blocks'
]:
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
if
block
[
"lines"
][
0
][
"spans"
][
0
]
.
get
(
'latex'
,
''
):
if
block
[
"lines"
][
0
][
"spans"
][
0
]
.
get
(
'latex'
,
''
):
...
@@ -200,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
...
@@ -200,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
para_content
[
'table_body'
]
=
f
"
\n\n
{block['lines'][0]['spans'][0]['html']}
\n\n
"
para_content
[
'table_body'
]
=
f
"
\n\n
{block['lines'][0]['spans'][0]['html']}
\n\n
"
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
block
[
"lines"
][
0
][
"spans"
][
0
][
'image_path'
])
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
block
[
"lines"
][
0
][
"spans"
][
0
][
'image_path'
])
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
para_content
[
'table_caption'
]
=
merge_para_with_text
(
block
)
para_content
[
'table_caption'
]
.
append
(
merge_para_with_text
(
block
)
)
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_content
[
'table_footnote'
]
=
merge_para_with_text
(
block
)
para_content
[
'table_footnote'
]
.
append
(
merge_para_with_text
(
block
)
)
para_content
[
'page_idx'
]
=
page_idx
para_content
[
'page_idx'
]
=
page_idx
...
...
magic_pdf/model/pdf_extract_kit.py
View file @
25a6d4ba
...
@@ -314,7 +314,7 @@ class CustomPEKModel:
...
@@ -314,7 +314,7 @@ class CustomPEKModel:
elif
self
.
layout_model_name
==
MODEL_NAME
.
DocLayout_YOLO
:
elif
self
.
layout_model_name
==
MODEL_NAME
.
DocLayout_YOLO
:
# doclayout_yolo
# doclayout_yolo
layout_res
=
[]
layout_res
=
[]
doclayout_yolo_res
=
self
.
layout_model
.
predict
(
image
,
imgsz
=
1024
,
conf
=
0.
1
5
,
iou
=
0.45
,
verbose
=
True
,
device
=
self
.
device
)[
0
]
doclayout_yolo_res
=
self
.
layout_model
.
predict
(
image
,
imgsz
=
1024
,
conf
=
0.
2
5
,
iou
=
0.45
,
verbose
=
True
,
device
=
self
.
device
)[
0
]
for
xyxy
,
conf
,
cla
in
zip
(
doclayout_yolo_res
.
boxes
.
xyxy
.
cpu
(),
doclayout_yolo_res
.
boxes
.
conf
.
cpu
(),
doclayout_yolo_res
.
boxes
.
cls
.
cpu
()):
for
xyxy
,
conf
,
cla
in
zip
(
doclayout_yolo_res
.
boxes
.
xyxy
.
cpu
(),
doclayout_yolo_res
.
boxes
.
conf
.
cpu
(),
doclayout_yolo_res
.
boxes
.
cls
.
cpu
()):
xmin
,
ymin
,
xmax
,
ymax
=
[
int
(
p
.
item
())
for
p
in
xyxy
]
xmin
,
ymin
,
xmax
,
ymax
=
[
int
(
p
.
item
())
for
p
in
xyxy
]
new_item
=
{
new_item
=
{
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
25a6d4ba
import
copy
import
os
import
os
import
statistics
import
statistics
import
time
import
time
...
@@ -15,7 +16,7 @@ from magic_pdf.libs.convert_utils import dict_to_list
...
@@ -15,7 +16,7 @@ from magic_pdf.libs.convert_utils import dict_to_list
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.local_math
import
float_equal
from
magic_pdf.libs.local_math
import
float_equal
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.para.para_split_v3
import
para_split
from
magic_pdf.para.para_split_v3
import
para_split
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
...
@@ -29,7 +30,7 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
...
@@ -29,7 +30,7 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2
ocr_prepare_bboxes_for_layout_split_v2
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
fill_spans_in_blocks
,
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
fill_spans_in_blocks
,
fix_block_spans
,
fix_block_spans
,
fix_discarded_block
)
fix_discarded_block
,
fix_block_spans_v2
)
from
magic_pdf.pre_proc.ocr_span_list_modify
import
(
from
magic_pdf.pre_proc.ocr_span_list_modify
import
(
get_qa_need_list_v2
,
remove_overlaps_low_confidence_spans
,
get_qa_need_list_v2
,
remove_overlaps_low_confidence_spans
,
remove_overlaps_min_spans
)
remove_overlaps_min_spans
)
...
@@ -173,19 +174,6 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
...
@@ -173,19 +174,6 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
def
cal_block_index
(
fix_blocks
,
sorted_bboxes
):
def
cal_block_index
(
fix_blocks
,
sorted_bboxes
):
for
block
in
fix_blocks
:
for
block
in
fix_blocks
:
# if block['type'] in ['text', 'title', 'interline_equation']:
# line_index_list = []
# if len(block['lines']) == 0:
# block['index'] = sorted_bboxes.index(block['bbox'])
# else:
# for line in block['lines']:
# line['index'] = sorted_bboxes.index(line['bbox'])
# line_index_list.append(line['index'])
# median_value = statistics.median(line_index_list)
# block['index'] = median_value
#
# elif block['type'] in ['table', 'image']:
# block['index'] = sorted_bboxes.index(block['bbox'])
line_index_list
=
[]
line_index_list
=
[]
if
len
(
block
[
'lines'
])
==
0
:
if
len
(
block
[
'lines'
])
==
0
:
...
@@ -197,9 +185,11 @@ def cal_block_index(fix_blocks, sorted_bboxes):
...
@@ -197,9 +185,11 @@ def cal_block_index(fix_blocks, sorted_bboxes):
median_value
=
statistics
.
median
(
line_index_list
)
median_value
=
statistics
.
median
(
line_index_list
)
block
[
'index'
]
=
median_value
block
[
'index'
]
=
median_value
# 删除图表block中的虚拟line信息
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
if
block
[
'type'
]
in
[
'table'
,
'image'
]:
if
block
[
'type'
]
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
]:
del
block
[
'lines'
]
block
[
'virtual_lines'
]
=
copy
.
deepcopy
(
block
[
'lines'
])
block
[
'lines'
]
=
copy
.
deepcopy
(
block
[
'real_lines'
])
del
block
[
'real_lines'
]
return
fix_blocks
return
fix_blocks
...
@@ -250,7 +240,11 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
...
@@ -250,7 +240,11 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
):
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
):
page_line_list
=
[]
page_line_list
=
[]
for
block
in
fix_blocks
:
for
block
in
fix_blocks
:
if
block
[
'type'
]
in
[
'text'
,
'title'
,
'interline_equation'
]:
if
block
[
'type'
]
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
InterlineEquation
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
]:
if
len
(
block
[
'lines'
])
==
0
:
if
len
(
block
[
'lines'
])
==
0
:
bbox
=
block
[
'bbox'
]
bbox
=
block
[
'bbox'
]
lines
=
insert_lines_into_block
(
bbox
,
line_height
,
page_w
,
page_h
)
lines
=
insert_lines_into_block
(
bbox
,
line_height
,
page_w
,
page_h
)
...
@@ -261,8 +255,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
...
@@ -261,8 +255,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
bbox
=
line
[
'bbox'
]
bbox
=
line
[
'bbox'
]
page_line_list
.
append
(
bbox
)
page_line_list
.
append
(
bbox
)
elif
block
[
'type'
]
in
[
'table'
,
'image'
]:
elif
block
[
'type'
]
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
]:
bbox
=
block
[
'bbox'
]
bbox
=
block
[
'bbox'
]
block
[
"real_lines"
]
=
copy
.
deepcopy
(
block
[
'lines'
])
lines
=
insert_lines_into_block
(
bbox
,
line_height
,
page_w
,
page_h
)
lines
=
insert_lines_into_block
(
bbox
,
line_height
,
page_w
,
page_h
)
block
[
'lines'
]
=
[]
block
[
'lines'
]
=
[]
for
line
in
lines
:
for
line
in
lines
:
...
@@ -316,7 +311,11 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
...
@@ -316,7 +311,11 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
def
get_line_height
(
blocks
):
def
get_line_height
(
blocks
):
page_line_height_list
=
[]
page_line_height_list
=
[]
for
block
in
blocks
:
for
block
in
blocks
:
if
block
[
'type'
]
in
[
'text'
,
'title'
,
'interline_equation'
]:
if
block
[
'type'
]
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
]:
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
bbox
=
line
[
'bbox'
]
bbox
=
line
[
'bbox'
]
page_line_height_list
.
append
(
int
(
bbox
[
3
]
-
bbox
[
1
]))
page_line_height_list
.
append
(
int
(
bbox
[
3
]
-
bbox
[
1
]))
...
@@ -326,6 +325,63 @@ def get_line_height(blocks):
...
@@ -326,6 +325,63 @@ def get_line_height(blocks):
return
10
return
10
def process_groups(groups, body_key, caption_key, footnote_key):
    """Flatten grouped image/table detections into three parallel lists.

    Every group is a mapping that holds one body block under ``body_key`` and
    iterables of caption / footnote blocks under ``caption_key`` and
    ``footnote_key``. Each block is tagged in place with a ``'group_id'``
    equal to the position of its group in ``groups`` so the grouping can be
    rebuilt later from the flat lists.

    Args:
        groups: Sequence of group mappings.
        body_key: Key of the single body block inside a group.
        caption_key: Key of the caption block list inside a group.
        footnote_key: Key of the footnote block list inside a group.

    Returns:
        A tuple ``(body_blocks, caption_blocks, footnote_blocks)``. The lists
        contain the same (mutated) block objects that live inside ``groups``.
    """
    bodies, captions, footnotes = [], [], []
    for group_id, group in enumerate(groups):
        body = group[body_key]
        body['group_id'] = group_id
        bodies.append(body)
        # Captions first, then footnotes; both get the id of their owning group.
        for member_key, bucket in ((caption_key, captions), (footnote_key, footnotes)):
            for member in group[member_key]:
                member['group_id'] = group_id
                bucket.append(member)
    return bodies, captions, footnotes
def process_block_list(blocks, body_type, block_type):
    """Wrap the member blocks of one group into a single composite block.

    Args:
        blocks: Member blocks of the group; each must carry an ``'index'``.
        body_type: ``'type'`` value identifying the body member.
        block_type: ``'type'`` value to assign to the composite block.

    Returns:
        A dict with keys ``'type'``, ``'bbox'``, ``'blocks'`` and ``'index'``.
        ``'bbox'`` is the bbox of the first member whose type equals
        ``body_type`` (an empty list when there is none), ``'blocks'`` is the
        ``blocks`` argument itself, and ``'index'`` is the median of the
        members' indices.
    """
    median_index = statistics.median([member['index'] for member in blocks])

    # The composite block takes its bbox from the body member only.
    body_bbox = []
    for member in blocks:
        if member.get('type') == body_type:
            body_bbox = member['bbox']
            break

    composite = {'type': block_type}
    composite['bbox'] = body_bbox
    composite['blocks'] = blocks
    composite['index'] = median_index
    return composite
def revert_group_blocks(blocks):
    """Re-assemble flat image/table member blocks into grouped blocks.

    Blocks whose ``'type'`` is an image body/caption/footnote or a table
    body/caption/footnote are bucketed by their ``'group_id'``; every bucket
    is then turned into one composite block by ``process_block_list``. All
    other blocks are passed through unchanged.

    Args:
        blocks: Flat list of block dicts. Image/table member blocks must
            carry a ``'group_id'``.

    Returns:
        A new list: the pass-through blocks in their original relative order,
        followed by the composite image blocks and then the composite table
        blocks (each in order of first appearance of their group id).
    """
    image_member_types = (
        BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
    )
    table_member_types = (
        BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote,
    )

    image_groups = {}
    table_groups = {}
    new_blocks = []
    for block in blocks:
        block_type = block['type']
        if block_type in image_member_types:
            image_groups.setdefault(block['group_id'], []).append(block)
        elif block_type in table_member_types:
            table_groups.setdefault(block['group_id'], []).append(block)
        else:
            new_blocks.append(block)

    # Only the member lists are needed here; the loop variable is deliberately
    # not named ``blocks`` so the function argument is never rebound.
    for members in image_groups.values():
        new_blocks.append(
            process_block_list(members, BlockType.ImageBody, BlockType.Image))
    for members in table_groups.values():
        new_blocks.append(
            process_block_list(members, BlockType.TableBody, BlockType.Table))

    return new_blocks
def
parse_page_core
(
def
parse_page_core
(
page_doc
:
PageableData
,
magic_model
,
page_id
,
pdf_bytes_md5
,
imageWriter
,
parse_mode
page_doc
:
PageableData
,
magic_model
,
page_id
,
pdf_bytes_md5
,
imageWriter
,
parse_mode
):
):
...
@@ -333,8 +389,20 @@ def parse_page_core(
...
@@ -333,8 +389,20 @@ def parse_page_core(
drop_reason
=
[]
drop_reason
=
[]
"""从magic_model对象中获取后面会用到的区块信息"""
"""从magic_model对象中获取后面会用到的区块信息"""
img_blocks
=
magic_model
.
get_imgs
(
page_id
)
# img_blocks = magic_model.get_imgs(page_id)
table_blocks
=
magic_model
.
get_tables
(
page_id
)
# table_blocks = magic_model.get_tables(page_id)
img_groups
=
magic_model
.
get_imgs_v2
(
page_id
)
table_groups
=
magic_model
.
get_tables_v2
(
page_id
)
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
=
process_groups
(
img_groups
,
'image_body'
,
'image_caption_list'
,
'image_footnote_list'
)
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
=
process_groups
(
table_groups
,
'table_body'
,
'table_caption_list'
,
'table_footnote_list'
)
discarded_blocks
=
magic_model
.
get_discarded
(
page_id
)
discarded_blocks
=
magic_model
.
get_discarded
(
page_id
)
text_blocks
=
magic_model
.
get_text_blocks
(
page_id
)
text_blocks
=
magic_model
.
get_text_blocks
(
page_id
)
title_blocks
=
magic_model
.
get_title_blocks
(
page_id
)
title_blocks
=
magic_model
.
get_title_blocks
(
page_id
)
...
@@ -370,8 +438,8 @@ def parse_page_core(
...
@@ -370,8 +438,8 @@ def parse_page_core(
interline_equation_blocks
=
[]
interline_equation_blocks
=
[]
if
len
(
interline_equation_blocks
)
>
0
:
if
len
(
interline_equation_blocks
)
>
0
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_blocks
,
img_b
ody_blocks
,
img_caption_blocks
,
img_footnote_b
locks
,
table_blocks
,
table_b
ody_blocks
,
table_caption_blocks
,
table_footnote_b
locks
,
discarded_blocks
,
discarded_blocks
,
text_blocks
,
text_blocks
,
title_blocks
,
title_blocks
,
...
@@ -381,8 +449,8 @@ def parse_page_core(
...
@@ -381,8 +449,8 @@ def parse_page_core(
)
)
else
:
else
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_blocks
,
img_b
ody_blocks
,
img_caption_blocks
,
img_footnote_b
locks
,
table_blocks
,
table_b
ody_blocks
,
table_caption_blocks
,
table_footnote_b
locks
,
discarded_blocks
,
discarded_blocks
,
text_blocks
,
text_blocks
,
title_blocks
,
title_blocks
,
...
@@ -419,7 +487,7 @@ def parse_page_core(
...
@@ -419,7 +487,7 @@ def parse_page_core(
block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_bboxes
,
spans
,
0.5
)
block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_bboxes
,
spans
,
0.5
)
"""对block进行fix操作"""
"""对block进行fix操作"""
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_block
s
)
fix_blocks
=
fix_block_spans
_v2
(
block_with_span
s
)
"""获取所有line并计算正文line的高度"""
"""获取所有line并计算正文line的高度"""
line_height
=
get_line_height
(
fix_blocks
)
line_height
=
get_line_height
(
fix_blocks
)
...
@@ -430,6 +498,9 @@ def parse_page_core(
...
@@ -430,6 +498,9 @@ def parse_page_core(
"""根据line的中位数算block的序列关系"""
"""根据line的中位数算block的序列关系"""
fix_blocks
=
cal_block_index
(
fix_blocks
,
sorted_bboxes
)
fix_blocks
=
cal_block_index
(
fix_blocks
,
sorted_bboxes
)
"""将image和table的block还原回group形式参与后续流程"""
fix_blocks
=
revert_group_blocks
(
fix_blocks
)
"""重排block"""
"""重排block"""
sorted_blocks
=
sorted
(
fix_blocks
,
key
=
lambda
b
:
b
[
'index'
])
sorted_blocks
=
sorted
(
fix_blocks
,
key
=
lambda
b
:
b
[
'index'
])
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
25a6d4ba
...
@@ -60,29 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
...
@@ -60,29 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
return
all_bboxes
,
all_discarded_blocks
,
drop_reasons
return
all_bboxes
,
all_discarded_blocks
,
drop_reasons
def
ocr_prepare_bboxes_for_layout_split_v2
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
def
add_bboxes
(
blocks
,
block_type
,
bboxes
):
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
for
block
in
blocks
:
x0
,
y0
,
x1
,
y1
=
block
[
'bbox'
]
if
block_type
in
[
BlockType
.
ImageBody
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableBody
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
]:
bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
block_type
,
None
,
None
,
None
,
None
,
block
[
"score"
],
block
[
"group_id"
]])
else
:
bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
block_type
,
None
,
None
,
None
,
None
,
block
[
"score"
]])
def
ocr_prepare_bboxes_for_layout_split_v2
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
all_bboxes
=
[]
all_bboxes
=
[]
all_discarded_blocks
=
[]
for
image
in
img_blocks
:
x0
,
y0
,
x1
,
y1
=
image
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Image
,
None
,
None
,
None
,
None
,
image
[
"score"
]])
for
table
in
table_blocks
:
add_bboxes
(
img_body_blocks
,
BlockType
.
ImageBody
,
all_bboxes
)
x0
,
y0
,
x1
,
y1
=
table
[
'bbox'
]
add_bboxes
(
img_caption_blocks
,
BlockType
.
ImageCaption
,
all_bboxes
)
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Table
,
None
,
None
,
None
,
None
,
table
[
"score"
]])
add_bboxes
(
img_footnote_blocks
,
BlockType
.
ImageFootnote
,
all_bboxes
)
add_bboxes
(
table_body_blocks
,
BlockType
.
TableBody
,
all_bboxes
)
for
text
in
text_blocks
:
add_bboxes
(
table_caption_blocks
,
BlockType
.
TableCaption
,
all_bboxes
)
x0
,
y0
,
x1
,
y1
=
text
[
'bbox'
]
add_bboxes
(
table_footnote_blocks
,
BlockType
.
TableFootnote
,
all_bboxes
)
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Text
,
None
,
None
,
None
,
None
,
text
[
"score"
]])
add_bboxes
(
text_blocks
,
BlockType
.
Text
,
all_bboxes
)
add_bboxes
(
title_blocks
,
BlockType
.
Title
,
all_bboxes
)
for
title
in
title_blocks
:
add_bboxes
(
interline_equation_blocks
,
BlockType
.
InterlineEquation
,
all_bboxes
)
x0
,
y0
,
x1
,
y1
=
title
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Title
,
None
,
None
,
None
,
None
,
title
[
"score"
]])
for
interline_equation
in
interline_equation_blocks
:
x0
,
y0
,
x1
,
y1
=
interline_equation
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
InterlineEquation
,
None
,
None
,
None
,
None
,
interline_equation
[
"score"
]])
'''block嵌套问题解决'''
'''block嵌套问题解决'''
'''文本框与标题框重叠,优先信任文本框'''
'''文本框与标题框重叠,优先信任文本框'''
...
@@ -96,12 +101,14 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
...
@@ -96,12 +101,14 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
# 通过后续大框套小框逻辑删除
# 通过后续大框套小框逻辑删除
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50
%
区域的(限定footnote)'''
'''discarded_blocks'''
all_discarded_blocks
=
[]
add_bboxes
(
discarded_blocks
,
BlockType
.
Discarded
,
all_discarded_blocks
)
'''footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50
%
区域的'''
footnote_blocks
=
[]
footnote_blocks
=
[]
for
discarded
in
discarded_blocks
:
for
discarded
in
discarded_blocks
:
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
all_discarded_blocks
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Discarded
,
None
,
None
,
None
,
None
,
discarded
[
"score"
]])
# 将footnote加入到all_bboxes中,用来计算layout
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
footnote_blocks
.
append
([
x0
,
y0
,
x1
,
y1
])
footnote_blocks
.
append
([
x0
,
y0
,
x1
,
y1
])
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
25a6d4ba
...
@@ -153,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
...
@@ -153,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
'type'
:
block_type
,
'type'
:
block_type
,
'bbox'
:
block_bbox
,
'bbox'
:
block_bbox
,
}
}
if
block_type
in
[
BlockType
.
ImageBody
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableBody
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
]:
block_dict
[
"group_id"
]
=
block
[
-
1
]
block_spans
=
[]
block_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
span_bbox
=
span
[
'bbox'
]
span_bbox
=
span
[
'bbox'
]
...
@@ -201,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
...
@@ -201,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return
fix_blocks
return
fix_blocks
def fix_block_spans_v2(block_with_spans):
    """Normalise every block in ``block_with_spans`` according to its type.

    Text, title and image/table caption and footnote blocks are passed
    through ``fix_text_block``; interline-equation, image-body and table-body
    blocks are passed through ``fix_interline_block``. Blocks of any other
    type are left out of the result.

    NOTE(review): the helpers presumably convert each block's ``spans`` into
    ``lines`` and drop the ``spans`` field -- confirm against their
    definitions.

    Args:
        block_with_spans: Iterable of block dicts, each with a ``'type'`` key.

    Returns:
        List of the blocks returned by the helpers, in input order.
    """
    text_like_types = (
        BlockType.Text, BlockType.Title,
        BlockType.ImageCaption, BlockType.ImageFootnote,
        BlockType.TableCaption, BlockType.TableFootnote,
    )
    body_like_types = (
        BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody,
    )

    fixed = []
    for block in block_with_spans:
        kind = block['type']
        if kind in text_like_types:
            fixed.append(fix_text_block(block))
        elif kind in body_like_types:
            fixed.append(fix_interline_block(block))
        # Any other block type is intentionally not carried over.
    return fixed
def
fix_discarded_block
(
discarded_block_with_spans
):
def
fix_discarded_block
(
discarded_block_with_spans
):
fix_discarded_blocks
=
[]
fix_discarded_blocks
=
[]
for
block
in
discarded_block_with_spans
:
for
block
in
discarded_block_with_spans
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment