Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
c3b8f6d7
Commit
c3b8f6d7
authored
Apr 09, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
OCR line的左右侧如果超过layoutbox,那么让layoutbox截断左右侧
parent
ec187a1d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
25 additions
and
6 deletions
+25
-6
ocr_demo.py
demo/ocr_demo.py
+3
-3
para_split.py
magic_pdf/para/para_split.py
+22
-3
No files found.
demo/ocr_demo.py
View file @
c3b8f6d7
...
...
@@ -115,8 +115,8 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
if
__name__
==
'__main__'
:
pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
#
ocr_local_parse(pdf_path, json_file_path)
book_name
=
"科数网/edu_00011318"
ocr_online_parse
(
book_name
)
ocr_local_parse
(
pdf_path
,
json_file_path
)
#
book_name = "科数网/edu_00011318"
#
ocr_online_parse(book_name)
pass
magic_pdf/para/para_split.py
View file @
c3b8f6d7
...
...
@@ -183,11 +183,31 @@ def __valign_lines(blocks, layout_bboxes):
return
new_layout_bboxes
def __align_text_in_layout(blocks, layout_bboxes):
    """
    Clip OCR line boxes horizontally to the layout box that contains their block.

    Lines produced by OCR sometimes carry blank space before or after the
    text, so a line can stick out past its layout box. For every layout box,
    each block selected by ``is_in_layout(block['bbox'], layout_bbox)`` has
    the left edge (index 0) and right edge (index 2) of every line bbox
    clamped to the layout box's left and right edges.

    The line bboxes are modified in place; nothing is returned.
    """
    for layout in layout_bboxes:
        layout_box = layout['layout_bbox']
        # Select the blocks for this layout box up front, before any line is modified.
        contained_blocks = [blk for blk in blocks if is_in_layout(blk['bbox'], layout_box)]

        for blk in contained_blocks:
            for line in blk['lines']:
                line_box = line['bbox']
                left, right = line_box[0], line_box[2]
                # Clamp only the sides that overshoot the layout box.
                if left < layout_box[0]:
                    line_box[0] = layout_box[0]
                if right > layout_box[2]:
                    line_box[2] = layout_box[2]
def __common_pre_proc(blocks, layout_bboxes):
    """
    Language-independent pre-processing of the text.

    First clips the line boxes to their layout boxes via
    ``__align_text_in_layout`` (which works in place), then passes the same
    arguments to ``__valign_lines`` and returns its result.
    """
    # Disabled step, kept for reference:
    # __add_line_period(blocks, layout_bboxes)
    __align_text_in_layout(blocks, layout_bboxes)
    return __valign_lines(blocks, layout_bboxes)
...
...
@@ -233,7 +253,6 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_paras
=
[]
right_tail_distance
=
1.5
*
char_avg_len
for
lines
in
lines_group
:
paras
=
[]
total_lines
=
len
(
lines
)
...
...
@@ -575,8 +594,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
return
connected_layout_paras
,
page_list_info
def
para_split
(
pdf_info_dict
,
debug_mode
,
lang
=
"en"
):
"""
根据line和layout情况进行分段
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment