Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
2f13b3a8
Commit
2f13b3a8
authored
May 10, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add new drop scene
parent
a6013e63
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
7 additions
and
3 deletions
+7
-3
drop_reason.py
magic_pdf/libs/drop_reason.py
+1
-0
pdf_parse_union_core.py
magic_pdf/pdf_parse_union_core.py
+5
-2
ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+1
-1
No files found.
magic_pdf/libs/drop_reason.py
View file @
2f13b3a8
...
@@ -23,4 +23,5 @@ class DropReason:
...
@@ -23,4 +23,5 @@ class DropReason:
PSEUDO_SINGLE_COLUMN
=
"pseudo_single_column"
# 无法精确判断文字分栏
PSEUDO_SINGLE_COLUMN
=
"pseudo_single_column"
# 无法精确判断文字分栏
CAN_NOT_DETECT_PAGE_LAYOUT
=
"can_not_detect_page_layout"
# 无法分析页面的版面
CAN_NOT_DETECT_PAGE_LAYOUT
=
"can_not_detect_page_layout"
# 无法分析页面的版面
NEGATIVE_BBOX_AREA
=
"negative_bbox_area"
# 缩放导致 bbox 面积为负
NEGATIVE_BBOX_AREA
=
"negative_bbox_area"
# 缩放导致 bbox 面积为负
OVERLAP_BLOCKS_CAN_NOT_SEPARATION
=
"overlap_blocks_can_t_separation"
# 无法分离重叠的block
\ No newline at end of file
magic_pdf/pdf_parse_union_core.py
View file @
2f13b3a8
...
@@ -130,13 +130,16 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -130,13 +130,16 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''将所有区块的bbox整理到一起'''
'''将所有区块的bbox整理到一起'''
if
len
(
interline_equation_blocks
)
>
0
:
if
len
(
interline_equation_blocks
)
>
0
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split
(
all_bboxes
,
all_discarded_blocks
,
drop_reasons
=
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
)
interline_equation_blocks
,
page_w
,
page_h
)
else
:
else
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split
(
all_bboxes
,
all_discarded_blocks
,
drop_reasons
=
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equations
,
page_w
,
page_h
)
interline_equations
,
page_w
,
page_h
)
if
len
(
drop_reasons
)
>
0
:
need_drop
=
True
drop_reason
=
DropReason
.
OVERLAP_BLOCKS_CAN_NOT_SEPARATION
'''先处理不需要排版的discarded_blocks'''
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_discarded_blocks
,
spans
,
0.4
)
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_discarded_blocks
,
spans
,
0.4
)
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
2f13b3a8
...
@@ -51,7 +51,7 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
...
@@ -51,7 +51,7 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes
,
drop_reasons
=
remove_overlap_between_bbox_for_block
(
all_bboxes
)
all_bboxes
,
drop_reasons
=
remove_overlap_between_bbox_for_block
(
all_bboxes
)
return
all_bboxes
,
all_discarded_blocks
return
all_bboxes
,
all_discarded_blocks
,
drop_reasons
def
fix_text_overlap_title_blocks
(
all_bboxes
):
def
fix_text_overlap_title_blocks
(
all_bboxes
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment