Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
61a0c62c
Commit
61a0c62c
authored
Mar 12, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
pdf_info_dict中间态结构调整
parent
f31117de
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
13 additions
and
7 deletions
+13
-7
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+10
-4
ocr_detect_layout.py
magic_pdf/pre_proc/ocr_detect_layout.py
+3
-3
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
61a0c62c
...
@@ -18,11 +18,13 @@ from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_s
...
@@ -18,11 +18,13 @@ from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_s
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
def
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
):
def
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
):
return_dict
=
{
return_dict
=
{
'preproc_blocks'
:
blocks
,
'preproc_blocks'
:
blocks
,
'page_idx'
:
page_id
,
'layout_bboxes'
:
layout_bboxes
,
'layout_bboxes'
:
layout_bboxes
,
'page_idx'
:
page_id
,
'page_size'
:
[
page_w
,
page_h
],
'_layout_tree'
:
layout_tree
,
}
}
return
return_dict
return
return_dict
...
@@ -73,6 +75,9 @@ def parse_pdf_by_ocr(
...
@@ -73,6 +75,9 @@ def parse_pdf_by_ocr(
# 获取当前页的page对象
# 获取当前页的page对象
page
=
pdf_docs
[
page_id
]
page
=
pdf_docs
[
page_id
]
# 获取当前页的宽高
page_w
=
page
.
rect
.
width
page_h
=
page
.
rect
.
height
if
debug_mode
:
if
debug_mode
:
time_now
=
time
.
time
()
time_now
=
time
.
time
()
...
@@ -165,7 +170,7 @@ def parse_pdf_by_ocr(
...
@@ -165,7 +170,7 @@ def parse_pdf_by_ocr(
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
layout_bboxes
,
layout_tree
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
# 将spans合并成line(在layout内,从上到下,从左到右)
# 将spans合并成line(在layout内,从上到下,从左到右)
lines
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
lines
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
...
@@ -180,7 +185,7 @@ def parse_pdf_by_ocr(
...
@@ -180,7 +185,7 @@ def parse_pdf_by_ocr(
})
})
# 构造pdf_info_dict
# 构造pdf_info_dict
page_info
=
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
)
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
# 在测试时,保存调试信息
# 在测试时,保存调试信息
...
@@ -188,6 +193,7 @@ def parse_pdf_by_ocr(
...
@@ -188,6 +193,7 @@ def parse_pdf_by_ocr(
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
# draw_bbox
# draw_bbox
draw_layout_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
draw_layout_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
draw_text_bbox
(
pdf_info_dict
,
pdf_path
,
md_bookname_save_path
)
...
...
magic_pdf/pre_proc/ocr_detect_layout.py
View file @
61a0c62c
...
@@ -69,7 +69,7 @@ def adjust_layouts(layout_bboxes, page_boundry, page_id):
...
@@ -69,7 +69,7 @@ def adjust_layouts(layout_bboxes, page_boundry, page_id):
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
new_bboxes
,
page_boundry
,
page_id
)
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
new_bboxes
,
page_boundry
,
page_id
)
# 返回排序调整后的布局边界框列表
# 返回排序调整后的布局边界框列表
return
layout_bboxes
return
layout_bboxes
,
layout_tree
def
layout_detect
(
layout_info
,
page
:
fitz
.
Page
,
ocr_page_info
):
def
layout_detect
(
layout_info
,
page
:
fitz
.
Page
,
ocr_page_info
):
...
@@ -127,7 +127,7 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
...
@@ -127,7 +127,7 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
page_width
=
page
.
rect
.
width
page_width
=
page
.
rect
.
width
page_height
=
page
.
rect
.
height
page_height
=
page
.
rect
.
height
page_boundry
=
[
0
,
0
,
page_width
,
page_height
]
page_boundry
=
[
0
,
0
,
page_width
,
page_height
]
layout_bboxes
=
adjust_layouts
(
new_layout_bboxes
,
page_boundry
,
page_id
)
layout_bboxes
,
layout_tree
=
adjust_layouts
(
new_layout_bboxes
,
page_boundry
,
page_id
)
# 返回排序调整后的布局边界框列表
# 返回排序调整后的布局边界框列表
return
layout_bboxes
return
layout_bboxes
,
layout_tree
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment