Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
9a377463
Commit
9a377463
authored
May 10, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: remove high iou and low confidence blocks
parent
fec4372f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
6 deletions
+36
-6
magic_model.py
magic_pdf/model/magic_model.py
+28
-3
pdf_parse_union_core.py
magic_pdf/pdf_parse_union_core.py
+8
-3
No files found.
magic_pdf/model/magic_model.py
View file @
9a377463
...
@@ -15,7 +15,7 @@ from magic_pdf.libs.boxbase import (
...
@@ -15,7 +15,7 @@ from magic_pdf.libs.boxbase import (
bbox_relative_pos
,
bbox_relative_pos
,
bbox_distance
,
bbox_distance
,
_is_part_overlap
,
_is_part_overlap
,
calculate_overlap_area_in_bbox1_area_ratio
,
calculate_overlap_area_in_bbox1_area_ratio
,
calculate_iou
,
)
)
from
magic_pdf.libs.ModelBlockTypeEnum
import
ModelBlockTypeEnum
from
magic_pdf.libs.ModelBlockTypeEnum
import
ModelBlockTypeEnum
...
@@ -51,7 +51,7 @@ class MagicModel:
...
@@ -51,7 +51,7 @@ class MagicModel:
for
need_remove
in
need_remove_list
:
for
need_remove
in
need_remove_list
:
layout_dets
.
remove
(
need_remove
)
layout_dets
.
remove
(
need_remove
)
def
__fix_by_confidence
(
self
):
def
__fix_by_
remove_low_
confidence
(
self
):
for
model_page_info
in
self
.
__model_list
:
for
model_page_info
in
self
.
__model_list
:
need_remove_list
=
[]
need_remove_list
=
[]
layout_dets
=
model_page_info
[
"layout_dets"
]
layout_dets
=
model_page_info
[
"layout_dets"
]
...
@@ -63,11 +63,36 @@ class MagicModel:
...
@@ -63,11 +63,36 @@ class MagicModel:
for
need_remove
in
need_remove_list
:
for
need_remove
in
need_remove_list
:
layout_dets
.
remove
(
need_remove
)
layout_dets
.
remove
(
need_remove
)
def
__fix_by_remove_high_iou_and_low_confidence
(
self
):
for
model_page_info
in
self
.
__model_list
:
need_remove_list
=
[]
layout_dets
=
model_page_info
[
"layout_dets"
]
for
layout_det1
in
layout_dets
:
for
layout_det2
in
layout_dets
:
if
layout_det1
==
layout_det2
:
continue
if
layout_det1
[
"category_id"
]
in
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
]
and
layout_det2
[
"category_id"
]
in
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
]:
if
calculate_iou
(
layout_det1
[
'bbox'
],
layout_det2
[
'bbox'
])
>
0.9
:
if
layout_det1
[
'score'
]
<
layout_det2
[
'score'
]:
layout_det_need_remove
=
layout_det1
else
:
layout_det_need_remove
=
layout_det2
if
layout_det_need_remove
not
in
need_remove_list
:
need_remove_list
.
append
(
layout_det_need_remove
)
else
:
continue
else
:
continue
for
need_remove
in
need_remove_list
:
layout_dets
.
remove
(
need_remove
)
def
__init__
(
self
,
model_list
:
list
,
docs
:
fitz
.
Document
):
def
__init__
(
self
,
model_list
:
list
,
docs
:
fitz
.
Document
):
self
.
__model_list
=
model_list
self
.
__model_list
=
model_list
self
.
__docs
=
docs
self
.
__docs
=
docs
self
.
__fix_axis
()
self
.
__fix_axis
()
self
.
__fix_by_confidence
()
self
.
__fix_by_remove_low_confidence
()
self
.
__fix_by_remove_high_iou_and_low_confidence
()
def
__reduct_overlap
(
self
,
bboxes
):
def
__reduct_overlap
(
self
,
bboxes
):
N
=
len
(
bboxes
)
N
=
len
(
bboxes
)
...
...
magic_pdf/pdf_parse_union_core.py
View file @
9a377463
...
@@ -129,9 +129,14 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -129,9 +129,14 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
spans
=
ocr_cut_image_and_table
(
spans
,
pdf_docs
[
page_id
],
page_id
,
pdf_bytes_md5
,
imageWriter
)
spans
=
ocr_cut_image_and_table
(
spans
,
pdf_docs
[
page_id
],
page_id
,
pdf_bytes_md5
,
imageWriter
)
'''将所有区块的bbox整理到一起'''
'''将所有区块的bbox整理到一起'''
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split
(
if
len
(
interline_equation_blocks
)
>
0
:
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split
(
interline_equations
,
page_w
,
page_h
)
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
)
else
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equations
,
page_w
,
page_h
)
'''先处理不需要排版的discarded_blocks'''
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_discarded_blocks
,
spans
,
0.4
)
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_discarded_blocks
,
spans
,
0.4
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment