Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
099f19f2
Unverified
Commit
099f19f2
authored
Nov 01, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 01, 2024
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #834 from myhloli/dev
feat(pdf_parse): improve span filtering and add new block types
parents
73afb7d6
149132d6
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
49 additions
and
36 deletions
+49
-36
output_file_en_us.md
docs/output_file_en_us.md
+3
-0
output_file_zh_cn.md
docs/output_file_zh_cn.md
+8
-5
draw_bbox.py
magic_pdf/libs/draw_bbox.py
+2
-1
version.py
magic_pdf/libs/version.py
+1
-1
pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+35
-29
No files found.
docs/output_file_en_us.md
View file @
099f19f2
...
@@ -175,11 +175,14 @@ Detailed explanation of second-level block types
...
@@ -175,11 +175,14 @@ Detailed explanation of second-level block types
| :----------------- | :--------------------- |
| :----------------- | :--------------------- |
| image_body | Main body of the image |
| image_body | Main body of the image |
| image_caption | Image description text |
| image_caption | Image description text |
| image_footnote | Image footnote |
| table_body | Main body of the table |
| table_body | Main body of the table |
| table_caption | Table description text |
| table_caption | Table description text |
| table_footnote | Table footnote |
| table_footnote | Table footnote |
| text | Text block |
| text | Text block |
| title | Title block |
| title | Title block |
| index | Index block |
| list | List block |
| interline_equation | Block formula |
| interline_equation | Block formula |
<br>
<br>
...
...
docs/output_file_zh_cn.md
View file @
099f19f2
...
@@ -174,11 +174,14 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、
...
@@ -174,11 +174,14 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、
| :----------------- | :------------- |
| :----------------- | :------------- |
| image_body | 图像的本体 |
| image_body | 图像的本体 |
| image_caption | 图像的描述文本 |
| image_caption | 图像的描述文本 |
| image_footnote | 图像的脚注 |
| table_body | 表格本体 |
| table_body | 表格本体 |
| table_caption | 表格的描述文本 |
| table_caption | 表格的描述文本 |
| table_footnote | 表格的脚注 |
| table_footnote | 表格的脚注 |
| text | 文本块 |
| text | 文本块 |
| title | 标题块 |
| title | 标题块 |
| index | 目录块 |
| list | 列表块 |
| interline_equation | 行间公式块 |
| interline_equation | 行间公式块 |
<br>
<br>
...
...
magic_pdf/libs/draw_bbox.py
View file @
099f19f2
...
@@ -249,7 +249,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -249,7 +249,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
page_dropped_list
.
append
(
span
[
'bbox'
])
page_dropped_list
.
append
(
span
[
'bbox'
])
dropped_list
.
append
(
page_dropped_list
)
dropped_list
.
append
(
page_dropped_list
)
# 构造其余useful_list
# 构造其余useful_list
for
block
in
page
[
'para_blocks'
]:
# for block in page['para_blocks']: # span直接用分段合并前的结果就可以
for
block
in
page
[
'preproc_blocks'
]:
if
block
[
'type'
]
in
[
if
block
[
'type'
]
in
[
BlockType
.
Text
,
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
Title
,
...
...
magic_pdf/libs/version.py
View file @
099f19f2
__version__
=
"0.
8
.0"
__version__
=
"0.
9
.0"
magic_pdf/pdf_parse_union_core_v2.py
View file @
099f19f2
...
@@ -382,39 +382,44 @@ def revert_group_blocks(blocks):
...
@@ -382,39 +382,44 @@ def revert_group_blocks(blocks):
return
new_blocks
return
new_blocks
def
remove_outside_spans
(
spans
,
all_bboxes
):
def
remove_outside_spans
(
spans
,
all_bboxes
,
all_discarded_blocks
):
image_bboxes
=
[]
def
get_block_bboxes
(
blocks
,
block_type_list
):
table_bboxes
=
[
]
return
[
block
[
0
:
4
]
for
block
in
blocks
if
block
[
7
]
in
block_type_list
]
other_block_bboxes
=
[]
for
block
in
all_bboxes
:
image_bboxes
=
get_block_bboxes
(
all_bboxes
,
[
BlockType
.
ImageBody
])
block_type
=
block
[
7
]
table_bboxes
=
get_block_bboxes
(
all_bboxes
,
[
BlockType
.
TableBody
])
block_bbox
=
block
[
0
:
4
]
other_block_type
=
[
]
for
block_type
in
BlockType
.
__dict__
.
values
():
if
block_type
==
BlockType
.
ImageBody
:
if
not
isinstance
(
block_type
,
str
)
:
image_bboxes
.
append
(
block_bbox
)
continue
elif
block_type
==
BlockType
.
TableBody
:
if
block_type
not
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
]
:
table_bboxes
.
append
(
block_bbox
)
other_block_type
.
append
(
block_type
)
else
:
other_block_bboxes
=
get_block_bboxes
(
all_bboxes
,
other_block_type
)
other_block_bboxes
.
append
(
block_bbox
)
discarded_block_bboxes
=
get_block_bboxes
(
all_discarded_blocks
,
[
BlockType
.
Discarded
]
)
new_spans
=
[]
new_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
if
span
[
'type'
]
==
ContentType
.
Image
:
span_bbox
=
span
[
'bbox'
]
for
block_bbox
in
image_bboxes
:
span_type
=
span
[
'type'
]
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block_bbox
)
>
0.5
:
if
any
(
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.4
for
block_bbox
in
discarded_block_bboxes
):
new_spans
.
append
(
span
)
continue
if
span_type
==
ContentType
.
Image
:
if
any
(
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.5
for
block_bbox
in
image_bboxes
):
new_spans
.
append
(
span
)
new_spans
.
append
(
span
)
break
elif
span_type
==
ContentType
.
Table
:
elif
span
[
'type'
]
==
ContentType
.
Table
:
if
any
(
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.5
for
block_bbox
in
for
block_bbox
in
table_bboxes
:
table_bboxes
):
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block_bbox
)
>
0.5
:
new_spans
.
append
(
span
)
new_spans
.
append
(
span
)
break
else
:
else
:
for
block_bbox
in
other_block_bboxes
:
if
any
(
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.5
for
block_bbox
in
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block_bbox
)
>
0.5
:
other_block_bboxes
)
:
new_spans
.
append
(
span
)
new_spans
.
append
(
span
)
break
return
new_spans
return
new_spans
...
@@ -488,7 +493,8 @@ def parse_page_core(
...
@@ -488,7 +493,8 @@ def parse_page_core(
raise
Exception
(
'parse_mode must be txt or ocr'
)
raise
Exception
(
'parse_mode must be txt or ocr'
)
"""在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
"""在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
spans
=
remove_outside_spans
(
spans
,
all_bboxes
)
"""顺便删除大水印并保留abandon的span"""
spans
=
remove_outside_spans
(
spans
,
all_bboxes
,
all_discarded_blocks
)
"""删除重叠spans中置信度较低的那些"""
"""删除重叠spans中置信度较低的那些"""
spans
,
dropped_spans_by_confidence
=
remove_overlaps_low_confidence_spans
(
spans
)
spans
,
dropped_spans_by_confidence
=
remove_overlaps_low_confidence_spans
(
spans
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment