Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
85587b25
Commit
85587b25
authored
Mar 13, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
在dict中加入qa需要的字段
parent
b560c18f
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
119 additions
and
32 deletions
+119
-32
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+40
-20
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+13
-0
ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+66
-12
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
85587b25
...
...
@@ -22,20 +22,30 @@ from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from
magic_pdf.pre_proc.ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
merge_spans_to_line_by_layout
,
merge_spans_to_line_by_layout
,
merge_lines_to_block
,
)
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_spans_by_bboxes
,
remove_overlaps_min_spans
,
\
adjust_bbox_for_standalone_block
,
modify_y_axis
,
modify_inline_equation
adjust_bbox_for_standalone_block
,
modify_y_axis
,
modify_inline_equation
,
get_qa_need_list
,
\
remove_spans_by_bboxes_dict
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
def
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
):
def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                             images, tables, interline_equations, inline_equations,
                             dropped_text_block, dropped_image_block, dropped_table_block):
    """Bundle the per-page parsing results into a single dict.

    Every argument is stored as-is (no copying) under a fixed key; the page
    width and height are combined into a two-element ``page_size`` list.

    Returns:
        dict with the keys ``preproc_blocks``, ``layout_bboxes``, ``page_idx``,
        ``page_size``, ``_layout_tree``, ``images``, ``tables``,
        ``interline_equations``, ``inline_equations``, ``dropped_text_block``,
        ``dropped_image_block`` and ``dropped_table_block``, in that order.
    """
    page_size = [page_w, page_h]
    # Keyword construction keeps the same insertion order as the key list above.
    return dict(
        preproc_blocks=blocks,
        layout_bboxes=layout_bboxes,
        page_idx=page_id,
        page_size=page_size,
        _layout_tree=layout_tree,
        images=images,
        tables=tables,
        interline_equations=interline_equations,
        inline_equations=inline_equations,
        dropped_text_block=dropped_text_block,
        dropped_image_block=dropped_image_block,
        dropped_table_block=dropped_table_block,
    )
...
...
@@ -79,7 +89,6 @@ def parse_pdf_by_ocr(
start_time
=
time
.
time
()
remove_bboxes
=
[]
end_page_id
=
end_page_id
if
end_page_id
else
len
(
pdf_docs
)
-
1
for
page_id
in
range
(
start_page_id
,
end_page_id
+
1
):
...
...
@@ -111,11 +120,19 @@ def parse_pdf_by_ocr(
)
# 构建需要remove的bbox列表
need_remove_spans_bboxes
=
[]
need_remove_spans_bboxes
.
extend
(
page_no_bboxes
)
need_remove_spans_bboxes
.
extend
(
header_bboxes
)
need_remove_spans_bboxes
.
extend
(
footer_bboxes
)
need_remove_spans_bboxes
.
extend
(
footnote_bboxes
)
# need_remove_spans_bboxes = []
# need_remove_spans_bboxes.extend(page_no_bboxes)
# need_remove_spans_bboxes.extend(header_bboxes)
# need_remove_spans_bboxes.extend(footer_bboxes)
# need_remove_spans_bboxes.extend(footnote_bboxes)
# 构建需要remove的bbox字典
need_remove_spans_bboxes_dict
=
{
"page_no"
:
page_no_bboxes
,
"header"
:
header_bboxes
,
"footer"
:
footer_bboxes
,
"footnote"
:
footnote_bboxes
,
}
layout_dets
=
ocr_page_info
[
"layout_dets"
]
spans
=
[]
...
...
@@ -177,7 +194,9 @@ def parse_pdf_by_ocr(
spans
=
remove_overlaps_min_spans
(
spans
)
# 删除remove_span_block_bboxes中的bbox
spans
=
remove_spans_by_bboxes
(
spans
,
need_remove_spans_bboxes
)
# spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
# 按qa要求,增加drop相关数据
spans
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
# 对image和table截图
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
)
...
...
@@ -202,18 +221,19 @@ def parse_pdf_by_ocr(
# 将spans合并成line(在layout内,从上到下,从左到右)
lines
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks
=
[]
for
line
in
lines
:
blocks
.
append
(
{
"bbox"
:
line
[
"bbox"
],
"lines"
:
[
line
],
}
)
# 将lines合并成block
blocks
=
merge_lines_to_block
(
lines
)
# 根据block合并段落
# 获取QA需要外置的list
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
# 构造pdf_info_dict
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
)
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
# 在测试时,保存调试信息
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
85587b25
...
...
@@ -80,6 +80,19 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
return
lines
def merge_lines_to_block(lines):
    """Wrap each line in its own block.

    No real block merging is done yet; this only builds the structure. Every
    block holds exactly one line, and the block's ``bbox`` is that line's
    ``bbox`` (the same object, not a copy).

    Returns:
        A new list with one ``{"bbox": ..., "lines": [line]}`` dict per input
        line, in input order.
    """
    return [
        {"bbox": single_line["bbox"], "lines": [single_line]}
        for single_line in lines
    ]
...
...
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
85587b25
from
loguru
import
logger
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
,
\
__is_overlaps_y_exceeds_threshold
...
...
@@ -31,6 +33,32 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
return
spans
def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
    """Drop spans that overlap tagged removal bboxes and report what was dropped.

    For each ``tag -> bboxes`` entry (in dict order), every remaining span for
    which ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox)``
    exceeds 0.5 for at least one bbox is removed from ``spans``, gets
    ``span['tag'] = tag``, and is sorted into one of the dropped lists by its
    ``span['type']``. A span dropped under one tag is not re-examined for
    later tags.

    Args:
        spans: list of span dicts with at least ``'bbox'`` and ``'type'``.
            The list is modified in place.
        need_remove_spans_bboxes_dict: mapping of tag name to a list of bboxes.

    Returns:
        ``(spans, dropped_text_block, dropped_image_block, dropped_table_block)``
        where ``spans`` is the same list object that was passed in. Dropped
        spans whose type is none of text / inline_equation /
        displayed_equation / image / table are removed and tagged but do not
        appear in any dropped list.
    """
    dropped_text_block = []
    dropped_image_block = []
    dropped_table_block = []
    # Span types that are reported through the text drop list.
    text_like_types = ('text', 'inline_equation', 'displayed_equation')

    for key, value in need_remove_spans_bboxes_dict.items():
        # Partition in a single pass instead of collecting matches and calling
        # list.remove() for each one: remove() rescans the list and compares
        # dicts by value, which is quadratic and not identity-based.
        kept_spans = []
        for span in spans:
            is_dropped = any(
                calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
                for removed_bbox in value
            )
            if not is_dropped:
                kept_spans.append(span)
                continue

            span['tag'] = key
            span_type = span['type']
            if span_type in text_like_types:
                dropped_text_block.append(span)
            elif span_type == 'image':
                dropped_image_block.append(span)
            elif span_type == 'table':
                dropped_table_block.append(span)

        # Slice assignment keeps the caller's list object updated in place.
        spans[:] = kept_spans

    return spans, dropped_text_block, dropped_image_block, dropped_table_block
def
adjust_bbox_for_standalone_block
(
spans
):
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
for
sb_span
in
spans
:
...
...
@@ -46,7 +74,6 @@ def adjust_bbox_for_standalone_block(spans):
return
spans
def
modify_y_axis
(
spans
:
list
,
displayed_list
:
list
,
text_inline_lines
:
list
):
# displayed_list = []
...
...
@@ -105,8 +132,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
current_line
=
line
[
0
]
current_line
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
0
])
#调整每一个文字行内bbox统一
# 调整每一个文字行内bbox统一
for
line
in
text_inline_lines
:
current_line
,
(
line_first_y0
,
line_first_y
)
=
line
for
span
in
current_line
:
...
...
@@ -115,8 +141,9 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
# return spans, displayed_list, text_inline_lines
def
modify_inline_equation
(
spans
:
list
,
displayed_list
:
list
,
text_inline_lines
:
list
):
#错误行间公式转行内公式
#
错误行间公式转行内公式
j
=
0
for
i
in
range
(
len
(
displayed_list
)):
# if i == 8:
...
...
@@ -127,26 +154,53 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
while
j
<
len
(
text_inline_lines
):
text_line
=
text_inline_lines
[
j
]
y0
,
y1
=
text_line
[
1
]
if
(
span_y0
<
y0
and
span_y
>
y0
or
span_y0
<
y1
and
span_y
>
y1
or
span_y0
<
y0
and
span_y
>
y1
)
and
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
if
(
span_y0
<
y0
and
span_y
>
y0
or
span_y0
<
y1
and
span_y
>
y1
or
span_y0
<
y0
and
span_y
>
y1
)
and
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
#调整公式类型
#
调整公式类型
if
span
[
"type"
]
==
"displayed_equation"
:
#最后一行是行间公式
if
j
+
1
>=
len
(
text_inline_lines
):
#
最后一行是行间公式
if
j
+
1
>=
len
(
text_inline_lines
):
span
[
"type"
]
=
"inline_equation"
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
3
]
=
y1
else
:
#行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
#
行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
y0_next
,
y1_next
=
text_inline_lines
[
j
+
1
][
1
]
if
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0_next
,
0
,
y1_next
))
and
3
*
(
y1
-
y0
)
>
span_y
-
span_y0
:
if
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0_next
,
0
,
y1_next
))
and
3
*
(
y1
-
y0
)
>
span_y
-
span_y0
:
span
[
"type"
]
=
"inline_equation"
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
3
]
=
y1
break
elif
span_y
<
y0
or
span_y0
<
y0
and
span_y
>
y0
and
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
elif
span_y
<
y0
or
span_y0
<
y0
and
span_y
>
y0
and
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
break
else
:
j
+=
1
return
spans
\ No newline at end of file
return
spans
def get_qa_need_list(blocks):
    """Collect the image, table and equation spans found inside ``blocks``.

    Walks ``block["lines"]`` -> ``line["spans"]`` and sorts each span by its
    ``"type"``. The returned lists hold the very same span objects that live
    in ``blocks`` (they are not copied); spans of any other type are ignored.

    Returns:
        ``(images, tables, interline_equations, inline_equations)`` where
        ``interline_equations`` holds spans typed ``"displayed_equation"``.
    """
    images = []
    tables = []
    interline_equations = []
    inline_equations = []

    # Span type -> the list that collects spans of that type.
    bucket_by_type = {
        "image": images,
        "table": tables,
        "inline_equation": inline_equations,
        "displayed_equation": interline_equations,
    }

    for block in blocks:
        for line in block["lines"]:
            for span in line["spans"]:
                bucket = bucket_by_type.get(span["type"])
                if bucket is not None:
                    bucket.append(span)

    return images, tables, interline_equations, inline_equations
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment