Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
a77cb36d
Unverified
Commit
a77cb36d
authored
Apr 22, 2024
by
myhloli
Committed by
GitHub
Apr 22, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #41 from myhloli/master
block type 字段名修复
parents
af84a8ac
45ce99bf
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
33 additions
and
11 deletions
+33
-11
drop_tag.py
magic_pdf/libs/drop_tag.py
+1
-0
pdf_parse_by_ocr_v2.py
magic_pdf/pdf_parse_by_ocr_v2.py
+0
-4
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+25
-0
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+4
-4
ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+3
-3
No files found.
magic_pdf/libs/drop_tag.py
View file @
a77cb36d
...
@@ -16,3 +16,4 @@ class DropTag:
...
@@ -16,3 +16,4 @@ class DropTag:
FOOTNOTE
=
"footnote"
FOOTNOTE
=
"footnote"
NOT_IN_LAYOUT
=
"not_in_layout"
NOT_IN_LAYOUT
=
"not_in_layout"
SPAN_OVERLAP
=
"span_overlap"
SPAN_OVERLAP
=
"span_overlap"
BLOCK_OVERLAP
=
"block_overlap"
magic_pdf/pdf_parse_by_ocr_v2.py
View file @
a77cb36d
...
@@ -70,10 +70,6 @@ def parse_pdf_by_ocr(pdf_bytes,
...
@@ -70,10 +70,6 @@ def parse_pdf_by_ocr(pdf_bytes,
'''根据layout顺序,对当前页面所有需要留下的block进行排序'''
'''根据layout顺序,对当前页面所有需要留下的block进行排序'''
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
'''block嵌套问题解决'''
#@todo 1. text block大框套小框,删除小框 2. 图片或文本框与舍弃框重叠,优先信任舍弃框 3. 文本框与标题框重叠,优先信任文本框
'''获取所有需要拼接的span资源'''
'''获取所有需要拼接的span资源'''
spans
=
magic_model
.
get_all_spans
(
page_id
)
spans
=
magic_model
.
get_all_spans
(
page_id
)
'''删除重叠spans中较小的那些'''
'''删除重叠spans中较小的那些'''
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
a77cb36d
from
magic_pdf.libs.boxbase
import
get_minbox_if_overlap_by_ratio
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
BlockType
from
magic_pdf.libs.ocr_content_type
import
BlockType
...
@@ -31,5 +33,28 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
...
@@ -31,5 +33,28 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
'''block嵌套问题解决'''
# @todo 1. text block大框套小框,删除小框 2. 图片或文本框与舍弃框重叠,优先信任舍弃框 3. 文本框与标题框重叠,优先信任文本框
all_bboxes
,
dropped_blocks
=
remove_overlaps_min_blocks
(
all_bboxes
)
return
all_bboxes
return
all_bboxes
def remove_overlaps_min_blocks(all_bboxes):
    """Remove blocks that heavily overlap another block and report them.

    Every ordered pair of unequal blocks is handed to
    ``get_minbox_if_overlap_by_ratio`` (as 4-tuples built from the first four
    items of each block) with a ratio of 0.8. When that helper returns a box,
    the first block still present in ``all_bboxes`` whose coordinates equal
    the returned box is removed and recorded as dropped.

    Args:
        all_bboxes: list of block records; each record is an indexable
            sequence whose first four items are the box coordinates.
            The list is mutated in place.

    Returns:
        A ``(all_bboxes, dropped_blocks)`` tuple. ``all_bboxes`` is the same
        list object that was passed in, minus the removed blocks.
        ``dropped_blocks`` is a list of dicts with the keys ``'bbox'``,
        ``'type'`` and ``'tag'`` (always ``DropTag.BLOCK_OVERLAP``).
    """
    dropped_blocks = []
    # Iterate over snapshots so that removing from all_bboxes inside the
    # loops does not disturb the iteration itself.
    for block1 in all_bboxes.copy():
        for block2 in all_bboxes.copy():
            if block1 == block2:
                continue
            block1_box = block1[0], block1[1], block1[2], block1[3]
            block2_box = block2[0], block2[1], block2[2], block2[3]
            # presumably returns the smaller of the two boxes when the overlap
            # ratio exceeds 0.8, else None -- confirm against libs.boxbase
            overlap_box = get_minbox_if_overlap_by_ratio(block1_box, block2_box, 0.8)
            if overlap_box is None:
                continue
            # Compare as tuples: the helper receives tuples while blocks are
            # lists, and a list never compares equal to a tuple.
            overlap_key = tuple(overlap_box)
            bbox_to_remove = next(
                (block for block in all_bboxes if tuple(block[0:4]) == overlap_key),
                None,
            )
            if bbox_to_remove is None:
                # Already removed by an earlier pair.
                continue
            all_bboxes.remove(bbox_to_remove)
            # Blocks are positional records, so a 'tag' key cannot be set on
            # them directly; wrap the dropped block in a dict instead.
            # NOTE(review): the 'bbox'/'type' keys follow the block dicts built
            # elsewhere in this package -- confirm this is the shape consumers
            # of dropped_blocks expect.
            dropped_blocks.append({
                'bbox': list(bbox_to_remove[0:4]),
                # index 7 holds the block type in the records built in this module
                'type': bbox_to_remove[7] if len(bbox_to_remove) > 7 else None,
                'tag': DropTag.BLOCK_OVERLAP,
            })
    return all_bboxes, dropped_blocks
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
a77cb36d
...
@@ -150,7 +150,7 @@ def fill_spans_in_blocks(blocks, spans):
...
@@ -150,7 +150,7 @@ def fill_spans_in_blocks(blocks, spans):
block_type
=
block
[
7
]
block_type
=
block
[
7
]
block_bbox
=
block
[
0
:
4
]
block_bbox
=
block
[
0
:
4
]
block_dict
=
{
block_dict
=
{
'
block_
type'
:
block_type
,
'type'
:
block_type
,
'bbox'
:
block_bbox
,
'bbox'
:
block_bbox
,
}
}
block_spans
=
[]
block_spans
=
[]
...
@@ -190,7 +190,7 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
...
@@ -190,7 +190,7 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
'''
'''
fix_blocks
=
[]
fix_blocks
=
[]
for
block
in
block_with_spans
:
for
block
in
block_with_spans
:
block_type
=
block
[
'
block_
type'
]
block_type
=
block
[
'type'
]
if
block_type
==
BlockType
.
Image
:
if
block_type
==
BlockType
.
Image
:
block
=
fix_image_block
(
block
,
img_blocks
)
block
=
fix_image_block
(
block
,
img_blocks
)
...
@@ -215,7 +215,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
...
@@ -215,7 +215,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
sort_block_lines
=
line_sort_spans_by_left_to_right
(
block_lines
)
sort_block_lines
=
line_sort_spans_by_left_to_right
(
block_lines
)
block
=
{
block
=
{
'bbox'
:
block_bbox
,
'bbox'
:
block_bbox
,
'
block_
type'
:
block_type
,
'type'
:
block_type
,
'lines'
:
sort_block_lines
'lines'
:
sort_block_lines
}
}
return
block
,
block_spans
return
block
,
block_spans
...
@@ -229,7 +229,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
...
@@ -229,7 +229,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
}
}
body_block
=
{
body_block
=
{
'bbox'
:
block_bbox
,
'bbox'
:
block_bbox
,
'
block_
type'
:
block_type
,
'type'
:
block_type
,
'lines'
:
[
body_line
]
'lines'
:
[
body_line
]
}
}
return
body_block
return
body_block
...
...
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
a77cb36d
...
@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks):
...
@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks):
interline_equations
=
[]
interline_equations
=
[]
for
block
in
blocks
:
for
block
in
blocks
:
if
block
[
"
block_
type"
]
==
BlockType
.
Image
:
if
block
[
"type"
]
==
BlockType
.
Image
:
images
.
append
(
block
)
images
.
append
(
block
)
elif
block
[
"
block_
type"
]
==
BlockType
.
Table
:
elif
block
[
"type"
]
==
BlockType
.
Table
:
tables
.
append
(
block
)
tables
.
append
(
block
)
elif
block
[
"
block_
type"
]
==
BlockType
.
InterlineEquation
:
elif
block
[
"type"
]
==
BlockType
.
InterlineEquation
:
interline_equations
.
append
(
block
)
interline_equations
.
append
(
block
)
return
images
,
tables
,
interline_equations
return
images
,
tables
,
interline_equations
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment