Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
0fb9619a
Unverified
Commit
0fb9619a
authored
May 07, 2024
by
Kaiwen Liu
Committed by
GitHub
May 07, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'magicpdf:master' into master
parents
8c3a37ff
eebd9767
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
67 additions
and
12 deletions
+67
-12
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+2
-3
pdf_parse_union_core.py
magic_pdf/pdf_parse_union_core.py
+1
-2
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+3
-0
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+2
-2
remove_bbox_overlap.py
magic_pdf/pre_proc/remove_bbox_overlap.py
+59
-5
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
0fb9619a
...
@@ -24,7 +24,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import (
...
@@ -24,7 +24,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import (
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_spans_by_bboxes
,
remove_overlaps_min_spans
,
\
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_spans_by_bboxes
,
remove_overlaps_min_spans
,
\
adjust_bbox_for_standalone_block
,
modify_y_axis
,
modify_inline_equation
,
get_qa_need_list
,
\
adjust_bbox_for_standalone_block
,
modify_y_axis
,
modify_inline_equation
,
get_qa_need_list
,
\
remove_spans_by_bboxes_dict
remove_spans_by_bboxes_dict
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
_for_span
def
parse_pdf_by_ocr
(
def
parse_pdf_by_ocr
(
...
@@ -158,8 +158,7 @@ def parse_pdf_by_ocr(
...
@@ -158,8 +158,7 @@ def parse_pdf_by_ocr(
spans
=
modify_inline_equation
(
spans
,
displayed_list
,
text_inline_lines
)
spans
=
modify_inline_equation
(
spans
,
displayed_list
,
text_inline_lines
)
'''bbox去除粘连'''
'''bbox去除粘连'''
spans
=
remove_overlap_between_bbox
(
spans
)
spans
=
remove_overlap_between_bbox_for_span
(
spans
)
'''
'''
对tpye=["interline_equation", "image", "table"]进行额外处理,
对tpye=["interline_equation", "image", "table"]进行额外处理,
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
...
...
magic_pdf/pdf_parse_union_core.py
View file @
0fb9619a
...
@@ -20,7 +20,6 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
...
@@ -20,7 +20,6 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
,
\
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
,
\
fix_discarded_block
fix_discarded_block
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
...
@@ -98,7 +97,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -98,7 +97,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
img_blocks
=
magic_model
.
get_imgs
(
page_id
)
img_blocks
=
magic_model
.
get_imgs
(
page_id
)
table_blocks
=
magic_model
.
get_tables
(
page_id
)
table_blocks
=
magic_model
.
get_tables
(
page_id
)
discarded_blocks
=
magic_model
.
get_discarded
(
page_id
)
discarded_blocks
=
magic_model
.
get_discarded
(
page_id
)
text_blocks
=
remove_overlap_between_bbox
(
magic_model
.
get_text_blocks
(
page_id
)
)
text_blocks
=
magic_model
.
get_text_blocks
(
page_id
)
title_blocks
=
magic_model
.
get_title_blocks
(
page_id
)
title_blocks
=
magic_model
.
get_title_blocks
(
page_id
)
inline_equations
,
interline_equations
,
interline_equation_blocks
=
magic_model
.
get_equations
(
page_id
)
inline_equations
,
interline_equations
,
interline_equation_blocks
=
magic_model
.
get_equations
(
page_id
)
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
0fb9619a
...
@@ -2,6 +2,7 @@ from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_ove
...
@@ -2,6 +2,7 @@ from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_ove
calculate_iou
calculate_iou
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
BlockType
from
magic_pdf.libs.ocr_content_type
import
BlockType
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox_for_block
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
...
@@ -35,6 +36,8 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
...
@@ -35,6 +36,8 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
all_bboxes
=
remove_need_drop_blocks
(
all_bboxes
,
discarded_blocks
)
all_bboxes
=
remove_need_drop_blocks
(
all_bboxes
,
discarded_blocks
)
'''经过以上处理后,还存在大框套小框的情况,则删除小框'''
'''经过以上处理后,还存在大框套小框的情况,则删除小框'''
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes
=
remove_overlap_between_bbox_for_block
(
all_bboxes
)
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50
%
区域的(限定footnote)'''
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50
%
区域的(限定footnote)'''
for
discarded
in
discarded_blocks
:
for
discarded
in
discarded_blocks
:
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
0fb9619a
...
@@ -5,7 +5,7 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
...
@@ -5,7 +5,7 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.pre_proc.ocr_span_list_modify
import
modify_y_axis
,
modify_inline_equation
from
magic_pdf.pre_proc.ocr_span_list_modify
import
modify_y_axis
,
modify_inline_equation
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
_for_span
# 将每一个line中的span从左到右排序
# 将每一个line中的span从左到右排序
...
@@ -168,7 +168,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
...
@@ -168,7 +168,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
block_spans
=
modify_inline_equation
(
block_spans
,
displayed_list
,
text_inline_lines
)
block_spans
=
modify_inline_equation
(
block_spans
,
displayed_list
,
text_inline_lines
)
'''bbox去除粘连'''
# 去粘连会影响span的bbox,导致后续fill的时候出错
'''bbox去除粘连'''
# 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox(block_spans)
# block_spans = remove_overlap_between_bbox
_for_span
(block_spans)
block_dict
[
'spans'
]
=
block_spans
block_dict
[
'spans'
]
=
block_spans
block_with_spans
.
append
(
block_dict
)
block_with_spans
.
append
(
block_dict
)
...
...
magic_pdf/pre_proc/remove_bbox_overlap.py
View file @
0fb9619a
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
,
_is_in
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
,
_is_in
def
_remove_overlap_between_bbox
(
spans
):
def
_remove_overlap_between_bbox
_for_span
(
spans
):
res
=
[]
res
=
[]
keeps
=
[
True
]
*
len
(
spans
)
keeps
=
[
True
]
*
len
(
spans
)
...
@@ -17,7 +17,7 @@ def _remove_overlap_between_bbox(spans):
...
@@ -17,7 +17,7 @@ def _remove_overlap_between_bbox(spans):
continue
continue
for
i
in
range
(
len
(
res
)):
for
i
in
range
(
len
(
res
)):
if
_is_in
(
v
[
"bbox"
],
res
[
i
][
"bbox"
]):
if
_is_in
(
v
[
"bbox"
],
res
[
i
][
"bbox"
]):
continue
continue
if
_is_in_or_part_overlap
(
res
[
i
][
"bbox"
],
v
[
"bbox"
]):
if
_is_in_or_part_overlap
(
res
[
i
][
"bbox"
],
v
[
"bbox"
]):
ix0
,
iy0
,
ix1
,
iy1
=
res
[
i
][
"bbox"
]
ix0
,
iy0
,
ix1
,
iy1
=
res
[
i
][
"bbox"
]
...
@@ -34,7 +34,7 @@ def _remove_overlap_between_bbox(spans):
...
@@ -34,7 +34,7 @@ def _remove_overlap_between_bbox(spans):
else
:
else
:
mid
=
(
ix0
+
x1
)
//
2
mid
=
(
ix0
+
x1
)
//
2
ix0
=
max
(
mid
+
0.25
,
ix0
)
ix0
=
max
(
mid
+
0.25
,
ix0
)
x1
=
min
(
mid
-
0.25
,
x1
)
x1
=
min
(
mid
-
0.25
,
x1
)
else
:
else
:
if
y1
>=
iy1
:
if
y1
>=
iy1
:
mid
=
(
y0
+
iy1
)
//
2
mid
=
(
y0
+
iy1
)
//
2
...
@@ -51,5 +51,59 @@ def _remove_overlap_between_bbox(spans):
...
@@ -51,5 +51,59 @@ def _remove_overlap_between_bbox(spans):
return
res
return
res
def remove_overlap_between_bbox(spans):
    """Return the result of ``_remove_overlap_between_bbox`` applied to *spans*.

    NOTE(review): the diff view interleaves this definition with the one
    below and appears to show it on the removed side -- confirm whether it
    should still exist after the rename to the ``*_for_span`` helper.
    """
    return _remove_overlap_between_bbox(spans)


def _remove_overlap_between_bbox_for_block(all_bboxes):
    """Drop contained boxes, then pull apart the remaining boxes that overlap.

    Only the first four values of each entry are read, and they are unpacked
    as ``x0, y0, x1, y1``.  Entries must support slice assignment because the
    adjusted coordinates are written back in place with ``entry[:4] = ...``.

    Pass 1 marks an entry for removal when ``_is_in`` reports its box as lying
    inside the box of any other entry.

    Pass 2 walks the surviving entries in order and compares each one with
    the entries already accepted.  When ``_is_in_or_part_overlap`` reports an
    overlap, the facing edges of both boxes are moved to ``mid - 0.25`` and
    ``mid + 0.25`` around a floor-divided midpoint: along x when the
    overlap is taller than it is wide (``diff_y > diff_x``), otherwise
    along y.

    NOTE(review): if ``_is_in`` treats two identical boxes as each containing
    the other, pass 1 drops both of them -- verify against its definition.

    Returns:
        A list holding the surviving entries (the same objects that were
        passed in, possibly with their first four values modified).
    """
    count = len(all_bboxes)

    # Pass 1: flag every entry whose box sits inside some other entry's box.
    keeps = [True] * count
    for cur in range(count):
        for other in range(count):
            if cur != other and _is_in(all_bboxes[cur][:4], all_bboxes[other][:4]):
                keeps[cur] = False

    # Pass 2: separate each surviving box from the ones accepted before it.
    res = []
    for keep, v in zip(keeps, all_bboxes):
        if not keep:
            continue

        for placed in res:
            if _is_in(v[:4], placed[:4]):
                continue
            if not _is_in_or_part_overlap(placed[:4], v[:4]):
                continue

            ix0, iy0, ix1, iy1 = placed[:4]
            x0, y0, x1, y1 = v[:4]

            # Extent of the shared region along each axis.
            diff_x = min(x1, ix1) - max(x0, ix0)
            diff_y = min(y1, iy1) - max(y0, iy0)

            if diff_y > diff_x:
                # Taller-than-wide overlap: split along x.
                if x1 >= ix1:
                    mid = (x0 + ix1) // 2
                    ix1 = min(mid - 0.25, ix1)
                    x0 = max(mid + 0.25, x0)
                else:
                    mid = (ix0 + x1) // 2
                    ix0 = max(mid + 0.25, ix0)
                    x1 = min(mid - 0.25, x1)
            else:
                # Otherwise split along y.
                if y1 >= iy1:
                    mid = (y0 + iy1) // 2
                    y0 = max(mid + 0.25, y0)
                    iy1 = min(iy1, mid - 0.25)
                else:
                    mid = (iy0 + y1) // 2
                    y1 = min(y1, mid - 0.25)
                    iy0 = max(mid + 0.25, iy0)

            # Write the adjusted coordinates back into both entries.
            placed[:4] = [ix0, iy0, ix1, iy1]
            v[:4] = [x0, y0, x1, y1]

        res.append(v)
    return res
def remove_overlap_between_bbox_for_span(spans):
    """Public wrapper: pass *spans* to ``_remove_overlap_between_bbox_for_span``.

    Returns whatever the private helper returns, unchanged.
    """
    separated = _remove_overlap_between_bbox_for_span(spans)
    return separated
def remove_overlap_between_bbox_for_block(all_bboxes):
    """Public wrapper: pass *all_bboxes* to ``_remove_overlap_between_bbox_for_block``.

    Returns whatever the private helper returns, unchanged.
    """
    separated = _remove_overlap_between_bbox_for_block(all_bboxes)
    return separated
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment