Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
5eab010b
Commit
5eab010b
authored
Mar 18, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocr模式对所有drop的span记录tag并分类
parent
f5bfaaf6
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
62 additions
and
25 deletions
+62
-25
drop_tag.py
magic_pdf/libs/drop_tag.py
+9
-1
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+34
-9
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+8
-2
ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+11
-13
No files found.
magic_pdf/libs/drop_tag.py
View file @
5eab010b
COLOR_BG_HEADER_TXT_BLOCK
=
"color_background_header_txt_block"
class
DropTag
:
PAGE_NUMBER
=
"page_no"
HEADER
=
"header"
FOOTER
=
"footer"
FOOTNOTE
=
"footnote"
NOT_IN_LAYOUT
=
"not_in_layout"
SPAN_OVERLAP
=
"span_overlap"
magic_pdf/pdf_parse_by_ocr.py
View file @
5eab010b
...
...
@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import (
get_docx_model_output
,
)
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.para.para_split
import
para_split
...
...
@@ -34,7 +35,7 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def
construct_page_component
(
blocks
,
para_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
need_remove_spans_bboxes_dict
):
return_dict
=
{
'preproc_blocks'
:
blocks
,
...
...
@@ -50,6 +51,7 @@ def construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w
'droped_text_block'
:
dropped_text_block
,
'droped_image_block'
:
dropped_image_block
,
'droped_table_block'
:
dropped_table_block
,
'dropped_equation_block'
:
dropped_equation_block
,
'droped_bboxes'
:
need_remove_spans_bboxes_dict
,
}
return
return_dict
...
...
@@ -133,10 +135,10 @@ def parse_pdf_by_ocr(
# 构建需要remove的bbox字典
need_remove_spans_bboxes_dict
=
{
"page_no"
:
page_no_bboxes
,
"header"
:
header_bboxes
,
"footer"
:
footer_bboxes
,
"footnote"
:
footnote_bboxes
,
DropTag
.
PAGE_NUMBER
:
page_no_bboxes
,
DropTag
.
HEADER
:
header_bboxes
,
DropTag
.
FOOTER
:
footer_bboxes
,
DropTag
.
FOOTNOTE
:
footnote_bboxes
,
}
layout_dets
=
ocr_page_info
[
"layout_dets"
]
...
...
@@ -202,12 +204,12 @@ def parse_pdf_by_ocr(
# 删除重叠spans中较小的那些
spans
=
remove_overlaps_min_spans
(
spans
)
spans
,
dropped_spans_by_span_overlap
=
remove_overlaps_min_spans
(
spans
)
# 删除remove_span_block_bboxes中的bbox
# spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
# 按qa要求,增加drop相关数据
spans
,
dropped_
text_block
,
dropped_image_block
,
dropped_table_block
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
spans
,
dropped_
spans_by_removed_bboxes
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
# 对image和table截图
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
...
...
@@ -230,7 +232,7 @@ def parse_pdf_by_ocr(
layout_bboxes
,
layout_tree
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
# 将spans合并成line(在layout内,从上到下,从左到右)
lines
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
lines
,
dropped_spans_by_layout
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
# 将lines合并成block
blocks
=
merge_lines_to_block
(
lines
)
...
...
@@ -241,10 +243,33 @@ def parse_pdf_by_ocr(
# 获取QA需要外置的list
images
,
tables
,
interline_equations
,
inline_equations
=
get_qa_need_list
(
blocks
)
# drop的span_list合并
dropped_spans
=
[]
dropped_spans
.
extend
(
dropped_spans_by_span_overlap
)
dropped_spans
.
extend
(
dropped_spans_by_removed_bboxes
)
dropped_spans
.
extend
(
dropped_spans_by_layout
)
dropped_text_block
=
[]
dropped_image_block
=
[]
dropped_table_block
=
[]
dropped_equation_block
=
[]
for
span
in
dropped_spans
:
# drop出的spans进行分类
if
span
[
'type'
]
==
ContentType
.
Text
:
dropped_text_block
.
append
(
span
)
elif
span
[
'type'
]
==
ContentType
.
Image
:
dropped_image_block
.
append
(
span
)
elif
span
[
'type'
]
==
ContentType
.
Table
:
dropped_table_block
.
append
(
span
)
elif
span
[
'type'
]
in
[
ContentType
.
InlineEquation
,
ContentType
.
InterlineEquation
]:
dropped_equation_block
.
append
(
span
)
# 构造pdf_info_dict
page_info
=
construct_page_component
(
blocks
,
para_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
dropped_equation_block
,
need_remove_spans_bboxes_dict
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
5eab010b
...
...
@@ -2,6 +2,7 @@ from loguru import logger
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
,
get_minbox_if_overlap_by_ratio
,
\
calculate_overlap_area_in_bbox1_area_ratio
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
ContentType
...
...
@@ -59,6 +60,7 @@ def merge_spans_to_line(spans):
def
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
):
lines
=
[]
new_spans
=
[]
dropped_spans
=
[]
for
item
in
layout_bboxes
:
layout_bbox
=
item
[
'layout_bbox'
]
# 遍历spans,将每个span放入对应的layout中
...
...
@@ -78,10 +80,14 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
layout_lines
=
merge_spans_to_line
(
layout_sapns
)
lines
.
extend
(
layout_lines
)
#对line中的span进行排序
#
对line中的span进行排序
lines
=
line_sort_spans_by_left_to_right
(
lines
)
return
lines
for
span
in
spans
:
span
[
'tag'
]
=
DropTag
.
NOT_IN_LAYOUT
dropped_spans
.
append
(
span
)
return
lines
,
dropped_spans
def
merge_lines_to_block
(
lines
):
...
...
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
5eab010b
...
...
@@ -2,10 +2,12 @@ from loguru import logger
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
,
\
__is_overlaps_y_exceeds_threshold
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
ContentType
def
remove_overlaps_min_spans
(
spans
):
dropped_spans
=
[]
# 删除重叠spans中较小的那些
for
span1
in
spans
.
copy
():
for
span2
in
spans
.
copy
():
...
...
@@ -15,7 +17,9 @@ def remove_overlaps_min_spans(spans):
bbox_to_remove
=
next
((
span
for
span
in
spans
if
span
[
'bbox'
]
==
overlap_box
),
None
)
if
bbox_to_remove
is
not
None
:
spans
.
remove
(
bbox_to_remove
)
return
spans
bbox_to_remove
[
'tag'
]
=
DropTag
.
SPAN_OVERLAP
dropped_spans
.
append
(
bbox_to_remove
)
return
spans
,
dropped_spans
def
remove_spans_by_bboxes
(
spans
,
need_remove_spans_bboxes
):
...
...
@@ -35,9 +39,7 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
def
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
):
dropped_text_block
=
[]
dropped_image_block
=
[]
dropped_table_block
=
[]
dropped_spans
=
[]
for
drop_tag
,
removed_bboxes
in
need_remove_spans_bboxes_dict
.
items
():
# logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
need_remove_spans
=
[]
...
...
@@ -50,14 +52,9 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
for
span
in
need_remove_spans
:
spans
.
remove
(
span
)
span
[
'tag'
]
=
drop_tag
if
span
[
'type'
]
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
,
ContentType
.
InterlineEquation
]:
dropped_text_block
.
append
(
span
)
elif
span
[
'type'
]
==
ContentType
.
Image
:
dropped_image_block
.
append
(
span
)
elif
span
[
'type'
]
==
ContentType
.
Table
:
dropped_table_block
.
append
(
span
)
dropped_spans
.
append
(
span
)
return
spans
,
dropped_
text_block
,
dropped_image_block
,
dropped_table_block
return
spans
,
dropped_
spans
def
adjust_bbox_for_standalone_block
(
spans
):
...
...
@@ -98,7 +95,8 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
or
any
(
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
s
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]
for
s
in
current_line
):
# 传入
if
span
[
"type"
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
displayed_list
.
append
(
span
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment