Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
5b9fa871
Commit
5b9fa871
authored
Mar 05, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/master'
parents
71a042d9
bc339320
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
45 additions
and
2 deletions
+45
-2
pdf_parse_by_model.py
magic_pdf/pdf_parse_by_model.py
+5
-0
detect_equation.py
magic_pdf/pre_proc/detect_equation.py
+11
-2
solve_line_alien.py
magic_pdf/pre_proc/solve_line_alien.py
+29
-0
No files found.
magic_pdf/pdf_parse_by_model.py
View file @
5b9fa871
...
...
@@ -59,6 +59,7 @@ from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
from
magic_pdf.pre_proc.remove_rotate_bbox
import
get_side_boundry
,
remove_rotate_side_textblock
,
remove_side_blank_block
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_text_block_horizontal_overlap
,
resolve_bbox_overlap_conflict
from
magic_pdf.pre_proc.fix_table
import
fix_table_text_block
,
fix_tables
,
include_table_title
from
magic_pdf.pre_proc.solve_line_alien
import
solve_inline_too_large_interval
denseSingleLineBlockException_msg
=
DenseSingleLineBlockException
()
.
message
titleDetectionException_msg
=
TitleDetectionException
()
.
message
...
...
@@ -446,6 +447,10 @@ def parse_pdf_by_model(
==================================================================================================================================
进入段落处理-2阶段
"""
# 处理行内文字间距较大问题
pdf_info_dict
=
solve_inline_too_large_interval
(
pdf_info_dict
)
start_time
=
time
.
time
()
para_process_pipeline
=
ParaProcessPipeline
()
...
...
magic_pdf/pre_proc/detect_equation.py
View file @
5b9fa871
from
magic_pdf.libs.boxbase
import
_is_in
# 正则
from
magic_pdf.libs.boxbase
import
_is_in
,
calculate_overlap_area_2_minbox_area_ratio
# 正则
from
magic_pdf.libs.commons
import
fitz
# pyMuPDF库
...
...
@@ -18,7 +18,16 @@ def __solve_contain_bboxs(all_bbox_list: list):
dump_list
.
append
(
all_bbox_list
[
i
])
elif
_is_in
(
bbox2
,
bbox1
):
dump_list
.
append
(
all_bbox_list
[
j
])
else
:
ratio
=
calculate_overlap_area_2_minbox_area_ratio
(
bbox1
,
bbox2
)
if
ratio
>
0.7
:
s1
=
(
bbox1
[
2
]
-
bbox1
[
0
])
*
(
bbox1
[
3
]
-
bbox1
[
1
])
s2
=
(
bbox2
[
2
]
-
bbox2
[
0
])
*
(
bbox2
[
3
]
-
bbox2
[
1
])
if
s2
>
s1
:
dump_list
.
append
(
all_bbox_list
[
i
])
else
:
dump_list
.
append
(
all_bbox_list
[
i
])
# 遍历需要删除的列表中的每个元素
for
item
in
dump_list
:
...
...
magic_pdf/pre_proc/solve_line_alien.py
0 → 100644
View file @
5b9fa871
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:
    """Prefix a space to line fragments that sit on the same row as the previous line.

    Within each block of every page's ``preproc_blocks``, consecutive entries
    of ``block['lines']`` are compared by the integer-truncated top (y1) and
    bottom (y2) of their ``bbox``.  When both match the previous line, a single
    space is prepended to the text of the current line's first span.
    Presumably such lines are pieces of one visual text row that was split
    because of a wide horizontal gap -- confirm against the line extractor.

    The comparison state is reset at the start of every block, so the first
    line of a block is never modified.

    Args:
        pdf_info_dict: Mapping with ``'page_{i}'`` keys; each page holds a
            ``'preproc_blocks'`` list of blocks, each block a ``'lines'`` list,
            and each line a 4-item ``'bbox'`` (x1, y1, x2, y2) and a
            ``'spans'`` list of dicts carrying ``'text'``.  Keys that are not
            page keys are ignored.

    Returns:
        The same ``pdf_info_dict`` object, modified in place.

    Note:
        The function is not idempotent: running it twice on the same data
        prepends a second space to the affected spans.
    """
    # Pages are addressed by index; len() is an upper bound on the page count
    # even when the dict also carries non-page keys, so missing indices are
    # simply skipped instead of raising KeyError.
    for i in range(len(pdf_info_dict)):
        page = pdf_info_dict.get(f'page_{i}')
        if not page:
            continue

        for block in page.get('preproc_blocks') or []:
            # None (rather than zeros) so that a genuine first line located at
            # y == 0 is not mistaken for a continuation of a previous line.
            prev_rows = None

            for line in block.get('lines') or []:
                # line bbox = [x1, y1, x2, y2]; only the vertical extent matters.
                _, y_top, _, y_bottom = line['bbox']
                cur_rows = (int(y_top), int(y_bottom))

                spans = line.get('spans')
                # Guard against lines without spans: nothing to prefix there.
                if cur_rows == prev_rows and spans:
                    spans[0]['text'] = ' ' + spans[0]['text']

                prev_rows = cur_rows

    return pdf_info_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment