Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
2e32ecfe
Commit
2e32ecfe
authored
Mar 12, 2024
by
liukaiwen
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of github.com:myhloli/Magic-PDF
# Conflicts: # demo/draw_bbox.py
parents
1d59509d
f31117de
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
147 additions
and
64 deletions
+147
-64
draw_bbox.py
demo/draw_bbox.py
+62
-17
ocr_demo.py
demo/ocr_demo.py
+12
-5
ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+28
-0
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+9
-11
ocr_cut_image.py
magic_pdf/pre_proc/ocr_cut_image.py
+2
-2
ocr_detect_layout.py
magic_pdf/pre_proc/ocr_detect_layout.py
+34
-29
No files found.
demo/draw_bbox.py
View file @
2e32ecfe
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
# PDF文件路径
def
draw_bbox
(
i
,
bbox_list
,
page
,
rgb_config
):
pdf_path
=
r"D:\projects\Magic-PDF\ocr_demo\ocr_1_org.pdf"
new_rgb
=
[]
for
item
in
rgb_config
:
item
=
float
(
item
)
/
255
new_rgb
.
append
(
item
)
page_data
=
bbox_list
[
i
]
for
bbox
in
page_data
:
x0
,
y0
,
x1
,
y1
=
bbox
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
doc
=
fitz
.
open
(
pdf_path
)
# Open the PDF
def
draw_layout_bbox
(
pdf_info_dict
,
input_path
,
out_path
):
# 你的数据
layout_bbox_list
=
[]
data
=
[[(
294.7569528415961
,
776.8430953398889
,
300.8827085852479
,
786.922616502779
),
(
460.1523579201934
,
776.8430953398889
,
509.51874244256345
,
787.2825994014537
)],
[(
294.03627569528413
,
779.7229585292861
,
301.24304715840384
,
788.3625480974777
),
(
85.76058041112454
,
781.882855921334
,
156.74727932285367
,
789.8024796921762
)],
[(
293.6759371221282
,
779.7229585292861
,
301.60338573155985
,
788.7225309961523
),
(
459.43168077388145
,
779.7229585292861
,
508.7980652962515
,
789.8024796921762
)],
[(
295.8379685610641
,
780.0829414279607
,
301.24304715840384
,
788.0025651988029
),
(
85.76058041112454
,
781.5228730226593
,
156.74727932285367
,
790.1624625908509
)],
[(
294.03627569528413
,
779.7229585292861
,
301.60338573155985
,
789.0825138948269
),
(
459.79201934703747
,
779.7229585292861
,
508.4377267230955
,
789.4424967935015
)],
[(
86.4812575574365
,
781.882855921334
,
156.0266021765417
,
789.8024796921762
)],
[(
294.39661426844015
,
779.7229585292861
,
301.24304715840384
,
788.3625480974777
),
(
459.43168077388145
,
779.7229585292861
,
508.7980652962515
,
789.4424967935015
)],
[(
294.03627569528413
,
779.7229585292861
,
301.24304715840384
,
788.3625480974777
),
(
85.76058041112454
,
781.5228730226593
,
156.74727932285367
,
789.8024796921762
)],
[(
294.39661426844015
,
779.7229585292861
,
300.8827085852479
,
788.3625480974777
)]]
for
page
in
pdf_info_dict
.
values
():
page_list
=
[]
for
layout
in
page
[
'layout_bboxes'
]:
page_list
.
append
(
layout
[
'layout_bbox'
])
layout_bbox_list
.
append
(
page_list
)
# 对每个页面进行处理
doc
=
fitz
.
open
(
input_path
)
for
i
,
page
in
enumerate
(
doc
):
for
i
,
page
in
enumerate
(
doc
):
# 获取当前页面的数据
# 获取当前页面的数据
page_data
=
data
[
i
]
page_data
=
layout_bbox_list
[
i
]
for
img
in
page_data
:
for
j
,
bbox
in
enumerate
(
page_data
):
# x0, y0, x1, y1, _ = img
x0
,
y0
,
x1
,
y1
=
bbox
x0
,
y0
,
x1
,
y1
=
img
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
(
1
,
0
,
0
),
fill
=
None
,
width
=
1.5
,
overlay
=
True
)
# Draw the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
(
1
,
0
,
0
),
fill
=
None
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
page
.
insert_text
((
x0
,
y0
),
str
(
j
+
1
),
fontsize
=
10
,
color
=
(
1
,
0
,
0
))
# Insert the index at the top left corner of the rectangle
# Save the PDF
doc
.
save
(
f
"{out_path}/layout.pdf"
)
def
draw_text_bbox
(
pdf_info_dict
,
input_path
,
out_path
):
text_list
=
[]
inline_equation_list
=
[]
displayed_equation_list
=
[]
for
page
in
pdf_info_dict
.
values
():
page_text_list
=
[]
page_inline_equation_list
=
[]
page_displayed_equation_list
=
[]
for
block
in
page
[
'preproc_blocks'
]:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
==
'text'
:
page_text_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
'inline_equation'
:
page_inline_equation_list
.
append
(
span
[
'bbox'
])
elif
span
[
'type'
]
==
'displayed_equation'
:
page_displayed_equation_list
.
append
(
span
[
'bbox'
])
text_list
.
append
(
page_text_list
)
inline_equation_list
.
append
(
page_inline_equation_list
)
displayed_equation_list
.
append
(
page_displayed_equation_list
)
doc
=
fitz
.
open
(
input_path
)
for
i
,
page
in
enumerate
(
doc
):
# 获取当前页面的数据
draw_bbox
(
i
,
text_list
,
page
,
[
255
,
0
,
0
])
draw_bbox
(
i
,
inline_equation_list
,
page
,
[
0
,
255
,
0
])
draw_bbox
(
i
,
displayed_equation_list
,
page
,
[
0
,
0
,
255
])
# Save the PDF
# Save the PDF
doc
.
save
(
r"D:\projects\Magic-PDF\ocr_demo\ocr_1_new2.pdf"
)
doc
.
save
(
f
"{out_path}/text.pdf"
)
\ No newline at end of file
demo/ocr_demo.py
View file @
2e32ecfe
...
@@ -4,7 +4,7 @@ import os
...
@@ -4,7 +4,7 @@ import os
from
loguru
import
logger
from
loguru
import
logger
from
pathlib
import
Path
from
pathlib
import
Path
from
magic_pdf.dict2md.ocr_mkcontent
import
mk_nlp_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
mk_nlp_markdown
,
mk_mm_markdown
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
...
@@ -30,15 +30,20 @@ def read_json_file(file_path):
...
@@ -30,15 +30,20 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0_org.pdf"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0.json"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
try
:
try
:
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
book_name
=
pth
.
name
book_name
=
pth
.
name
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
text_content_save_path
=
f
"{save_path}/{book_name}/book.md"
save_path_with_bookname
=
os
.
path
.
join
(
save_path
,
book_name
)
text_content_save_path
=
f
"{save_path_with_bookname}/book.md"
pdf_info_dict
=
parse_pdf_by_ocr
(
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_path
,
ocr_pdf_path
,
None
,
None
,
...
@@ -46,11 +51,13 @@ if __name__ == '__main__':
...
@@ -46,11 +51,13 @@ if __name__ == '__main__':
save_path
,
save_path
,
book_name
,
book_name
,
debug_mode
=
True
)
debug_mode
=
True
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
if
not
os
.
path
.
exists
(
parent_dir
):
if
not
os
.
path
.
exists
(
parent_dir
):
os
.
makedirs
(
parent_dir
)
os
.
makedirs
(
parent_dir
)
markdown_content
=
mk_nlp_markdown
(
pdf_info_dict
)
# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content
=
mk_mm_markdown
(
pdf_info_dict
)
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
markdown_content
)
f
.
write
(
markdown_content
)
...
...
magic_pdf/dict2md/ocr_mkcontent.py
View file @
2e32ecfe
...
@@ -21,3 +21,31 @@ def mk_nlp_markdown(pdf_info_dict: dict):
...
@@ -21,3 +21,31 @@ def mk_nlp_markdown(pdf_info_dict: dict):
# 在行末添加两个空格以强制换行
# 在行末添加两个空格以强制换行
markdown
.
append
(
line_text
.
strip
()
+
' '
)
markdown
.
append
(
line_text
.
strip
()
+
' '
)
return
'
\n
'
.
join
(
markdown
)
return
'
\n
'
.
join
(
markdown
)
def
mk_mm_markdown
(
pdf_info_dict
:
dict
):
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
():
blocks
=
page_info
.
get
(
"preproc_blocks"
)
if
not
blocks
:
continue
for
block
in
blocks
:
for
line
in
block
[
'lines'
]:
line_text
=
''
for
span
in
line
[
'spans'
]:
if
not
span
.
get
(
'content'
):
if
not
span
.
get
(
'image_path'
):
continue
else
:
content
=
f
""
else
:
content
=
span
[
'content'
]
.
replace
(
'$'
,
'
\
$'
)
# 转义$
if
span
[
'type'
]
==
'inline_equation'
:
content
=
f
"${content}$"
elif
span
[
'type'
]
==
'displayed_equation'
:
content
=
f
"$$
\n
{content}
\n
$$"
line_text
+=
content
+
' '
# 在行末添加两个空格以强制换行
markdown
.
append
(
line_text
.
strip
()
+
' '
)
return
'
\n
'
.
join
(
markdown
)
magic_pdf/pdf_parse_by_ocr.py
View file @
2e32ecfe
...
@@ -4,6 +4,7 @@ import time
...
@@ -4,6 +4,7 @@ import time
from
loguru
import
logger
from
loguru
import
logger
from
demo.draw_bbox
import
draw_layout_bbox
,
draw_text_bbox
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_docx_model_output
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_docx_model_output
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.libs.safe_filename
import
sanitize_filename
...
@@ -185,14 +186,11 @@ def parse_pdf_by_ocr(
...
@@ -185,14 +186,11 @@ def parse_pdf_by_ocr(
# 在测试时,保存调试信息
# 在测试时,保存调试信息
if
debug_mode
:
if
debug_mode
:
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
page_draw_rect_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"layout.pdf"
)
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
# 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除
# drow_bbox
if
os
.
path
.
exists
(
page_draw_rect_save_path
):
draw_layout_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
os
.
remove
(
page_draw_rect_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
# 绘制bbox和layout到pdf
return
pdf_info_dict
return
pdf_info_dict
magic_pdf/pre_proc/ocr_cut_image.py
View file @
2e32ecfe
...
@@ -12,8 +12,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
...
@@ -12,8 +12,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
for
span
in
spans
:
for
span
in
spans
:
span_type
=
span
[
'type'
]
span_type
=
span
[
'type'
]
if
span_type
==
'image'
:
if
span_type
==
'image'
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'image'
))
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'image
s
'
))
elif
span_type
==
'table'
:
elif
span_type
==
'table'
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'table'
))
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'table
s
'
))
return
spans
return
spans
magic_pdf/pre_proc/ocr_detect_layout.py
View file @
2e32ecfe
import
fitz
import
fitz
from
magic_pdf.layout.layout_sort
import
get_bboxes_layout
from
magic_pdf.libs.boxbase
import
_is_part_overlap
,
_is_in
from
magic_pdf.libs.boxbase
import
_is_part_overlap
,
_is_in
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
...
@@ -26,23 +27,16 @@ def get_area(bbox):
...
@@ -26,23 +27,16 @@ def get_area(bbox):
return
(
bbox
[
2
]
-
bbox
[
0
])
*
(
bbox
[
3
]
-
bbox
[
1
])
return
(
bbox
[
2
]
-
bbox
[
0
])
*
(
bbox
[
3
]
-
bbox
[
1
])
def
adjust_layouts
(
layout_bboxes
):
def
adjust_layouts
(
layout_bboxes
,
page_boundry
,
page_id
):
# 遍历所有布局框
# 遍历所有布局框
for
i
in
range
(
len
(
layout_bboxes
)):
for
i
in
range
(
len
(
layout_bboxes
)):
# 遍历当前布局框之后的布局框
# 遍历当前布局框之后的布局框
for
j
in
range
(
i
+
1
,
len
(
layout_bboxes
)):
for
j
in
range
(
i
+
1
,
len
(
layout_bboxes
)):
# 判断两个布局框是否重叠
# 判断两个布局框是否重叠
if
_is_part_overlap
(
layout_bboxes
[
i
]
[
"layout_bbox"
],
layout_bboxes
[
j
][
"layout_bbox"
]):
if
_is_part_overlap
(
layout_bboxes
[
i
]
,
layout_bboxes
[
j
]):
# 计算每个布局框的中心点坐标和面积
# 计算每个布局框的中心点坐标和面积
center_i
=
get_center_point
(
layout_bboxes
[
i
][
"layout_bbox"
])
area_i
=
get_area
(
layout_bboxes
[
i
])
area_i
=
get_area
(
layout_bboxes
[
i
][
"layout_bbox"
])
area_j
=
get_area
(
layout_bboxes
[
j
])
center_j
=
get_center_point
(
layout_bboxes
[
j
][
"layout_bbox"
])
area_j
=
get_area
(
layout_bboxes
[
j
][
"layout_bbox"
])
# 计算横向和纵向的距离差
dx
=
abs
(
center_i
[
0
]
-
center_j
[
0
])
dy
=
abs
(
center_i
[
1
]
-
center_j
[
1
])
# 较大布局框和较小布局框的赋值
# 较大布局框和较小布局框的赋值
if
area_i
>
area_j
:
if
area_i
>
area_j
:
...
@@ -50,19 +44,29 @@ def adjust_layouts(layout_bboxes):
...
@@ -50,19 +44,29 @@ def adjust_layouts(layout_bboxes):
else
:
else
:
larger_layout
,
smaller_layout
=
layout_bboxes
[
j
],
layout_bboxes
[
i
]
larger_layout
,
smaller_layout
=
layout_bboxes
[
j
],
layout_bboxes
[
i
]
center_large
=
get_center_point
(
larger_layout
)
center_small
=
get_center_point
(
smaller_layout
)
# 计算横向和纵向的距离差
distance_x
=
center_large
[
0
]
-
center_small
[
0
]
distance_y
=
center_large
[
1
]
-
center_small
[
1
]
# 根据距离差判断重叠方向并修正边界
# 根据距离差判断重叠方向并修正边界
if
dx
>
dy
:
# 左右重叠
if
abs
(
distance_x
)
>
abs
(
distance_y
)
:
# 左右重叠
if
larger_layout
[
"layout_bbox"
][
0
]
<
smaller_layout
[
"layout_bbox"
]
[
2
]:
if
distance_x
>
0
and
larger_layout
[
0
]
<
smaller_layout
[
2
]:
larger_layout
[
"layout_bbox"
][
0
]
=
smaller_layout
[
"layout_bbox"
][
2
]
larger_layout
[
0
]
=
smaller_layout
[
2
]
+
1
else
:
if
distance_x
<
0
and
larger_layout
[
2
]
>
smaller_layout
[
0
]
:
larger_layout
[
"layout_bbox"
][
2
]
=
smaller_layout
[
"layout_bbox"
][
0
]
larger_layout
[
2
]
=
smaller_layout
[
0
]
-
1
else
:
# 上下重叠
else
:
# 上下重叠
if
larger_layout
[
"layout_bbox"
][
1
]
<
smaller_layout
[
"layout_bbox"
][
3
]:
if
distance_y
>
0
and
larger_layout
[
1
]
<
smaller_layout
[
3
]:
larger_layout
[
"layout_bbox"
][
1
]
=
smaller_layout
[
"layout_bbox"
][
3
]
larger_layout
[
1
]
=
smaller_layout
[
3
]
+
1
else
:
if
distance_y
<
0
and
larger_layout
[
3
]
>
smaller_layout
[
1
]:
larger_layout
[
"layout_bbox"
][
3
]
=
smaller_layout
[
"layout_bbox"
][
1
]
larger_layout
[
3
]
=
smaller_layout
[
1
]
-
1
# todo 排序调整布局边界框列表
# 排序调整布局边界框列表
new_bboxes
=
[]
for
layout_bbox
in
layout_bboxes
:
new_bboxes
.
append
([
layout_bbox
[
0
],
layout_bbox
[
1
],
layout_bbox
[
2
],
layout_bbox
[
3
],
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
])
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
new_bboxes
,
page_boundry
,
page_id
)
# 返回排序调整后的布局边界框列表
# 返回排序调整后的布局边界框列表
return
layout_bboxes
return
layout_bboxes
...
@@ -79,6 +83,7 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
...
@@ -79,6 +83,7 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
"""
"""
page_id
=
ocr_page_info
[
'page_info'
][
'page_no'
]
-
1
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
ocr_page_info
,
page
)
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
ocr_page_info
,
page
)
# 初始化布局边界框列表
# 初始化布局边界框列表
layout_bboxes
=
[]
layout_bboxes
=
[]
...
@@ -88,12 +93,9 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
...
@@ -88,12 +93,9 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
sub_layout
[
'poly'
]
x0
,
y0
,
_
,
_
,
x1
,
y1
,
_
,
_
=
sub_layout
[
'poly'
]
bbox
=
[
int
(
x0
/
horizontal_scale_ratio
),
int
(
y0
/
vertical_scale_ratio
),
bbox
=
[
int
(
x0
/
horizontal_scale_ratio
),
int
(
y0
/
vertical_scale_ratio
),
int
(
x1
/
horizontal_scale_ratio
),
int
(
y1
/
vertical_scale_ratio
)]
int
(
x1
/
horizontal_scale_ratio
),
int
(
y1
/
vertical_scale_ratio
)]
# 创建子布局的边界框字典
layout_bbox
=
{
"layout_bbox"
:
bbox
,
}
# 将子布局的边界框添加到列表中
# 将子布局的边界框添加到列表中
layout_bboxes
.
append
(
layout_
bbox
)
layout_bboxes
.
append
(
bbox
)
# 初始化新的布局边界框列表
# 初始化新的布局边界框列表
new_layout_bboxes
=
[]
new_layout_bboxes
=
[]
...
@@ -102,14 +104,14 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
...
@@ -102,14 +104,14 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
# 初始化标记变量,用于判断当前边界框是否需要保留
# 初始化标记变量,用于判断当前边界框是否需要保留
keep
=
True
keep
=
True
# 获取当前边界框的坐标信息
# 获取当前边界框的坐标信息
box_i
=
layout_bboxes
[
i
]
[
"layout_bbox"
]
box_i
=
layout_bboxes
[
i
]
# 遍历其他边界框
# 遍历其他边界框
for
j
in
range
(
len
(
layout_bboxes
)):
for
j
in
range
(
len
(
layout_bboxes
)):
# 排除当前边界框自身
# 排除当前边界框自身
if
i
!=
j
:
if
i
!=
j
:
# 获取其他边界框的坐标信息
# 获取其他边界框的坐标信息
box_j
=
layout_bboxes
[
j
]
[
"layout_bbox"
]
box_j
=
layout_bboxes
[
j
]
# 检测box_i是否被box_j包含
# 检测box_i是否被box_j包含
if
_is_in
(
box_i
,
box_j
):
if
_is_in
(
box_i
,
box_j
):
# 如果当前边界框被其他边界框包含,则标记为不需要保留
# 如果当前边界框被其他边界框包含,则标记为不需要保留
...
@@ -122,7 +124,10 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
...
@@ -122,7 +124,10 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
new_layout_bboxes
.
append
(
layout_bboxes
[
i
])
new_layout_bboxes
.
append
(
layout_bboxes
[
i
])
# 对新的布局边界框列表进行排序调整
# 对新的布局边界框列表进行排序调整
layout_bboxes
=
adjust_layouts
(
new_layout_bboxes
)
page_width
=
page
.
rect
.
width
page_height
=
page
.
rect
.
height
page_boundry
=
[
0
,
0
,
page_width
,
page_height
]
layout_bboxes
=
adjust_layouts
(
new_layout_bboxes
,
page_boundry
,
page_id
)
# 返回排序调整后的布局边界框列表
# 返回排序调整后的布局边界框列表
return
layout_bboxes
return
layout_bboxes
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment