Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
d062bb6c
Commit
d062bb6c
authored
Mar 20, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
merge
parent
a4a9fd69
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
15 additions
and
12 deletions
+15
-12
ocr_demo.py
demo/ocr_demo.py
+9
-7
para_split.py
magic_pdf/para/para_split.py
+6
-5
No files found.
demo/ocr_demo.py
View file @
d062bb6c
...
...
@@ -83,11 +83,13 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
if
__name__
==
'__main__'
:
# pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
# json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
# pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
# ocr_local_parse(pdf_path, json_file_path)
#ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
ocr_pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
ocr_json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
ocr_online_parse
(
book_name
=
"数学新星网/edu_00001236"
)
magic_pdf/para/para_split.py
View file @
d062bb6c
...
...
@@ -2,7 +2,7 @@ from sklearn.cluster import DBSCAN
import
numpy
as
np
from
loguru
import
logger
from
magic_pdf.libs.boxbase
import
_is_in
from
magic_pdf.libs.boxbase
import
_is_in
_or_part_overlap
from
magic_pdf.libs.ocr_content_type
import
ContentType
...
...
@@ -50,7 +50,7 @@ def __valign_lines(blocks, layout_bboxes):
new_layout_bboxes
=
[]
for
layout_box
in
layout_bboxes
:
blocks_in_layoutbox
=
[
b
for
b
in
blocks
if
_is_in
(
b
[
'bbox'
],
layout_box
[
'layout_bbox'
])]
blocks_in_layoutbox
=
[
b
for
b
in
blocks
if
_is_in
_or_part_overlap
(
b
[
'bbox'
],
layout_box
[
'layout_bbox'
])]
if
len
(
blocks_in_layoutbox
)
==
0
:
continue
...
...
@@ -136,7 +136,7 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
lines_group
=
[]
for
lyout
in
layout_bboxes
:
lines
=
[
line
for
block
in
blocks
if
_is_in
(
block
[
'bbox'
],
lyout
[
'layout_bbox'
])
for
line
in
block
[
'lines'
]]
lines
=
[
line
for
block
in
blocks
if
_is_in
_or_part_overlap
(
block
[
'bbox'
],
lyout
[
'layout_bbox'
])
for
line
in
block
[
'lines'
]]
lines_group
.
append
(
lines
)
return
lines_group
...
...
@@ -159,6 +159,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
continue
#layout_right = max([line['bbox'][2] for line in lines])
layout_right
=
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
new_layout_bbox
)[
2
]
layout_left
=
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
new_layout_bbox
)[
0
]
para
=
[]
# 元素是line
for
i
,
line
in
enumerate
(
lines
):
...
...
@@ -173,7 +174,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
para
.
append
(
line
)
paras
.
append
(
para
)
para
=
[]
elif
line
[
'bbox'
][
2
]
>=
layout_right
-
right_tail_distance
and
next_line
and
next_line
[
'bbox'
][
0
]
==
layout_
righ
t
:
# 现在这行到了行尾沾满,下一行存在且顶格。
elif
line
[
'bbox'
][
2
]
>=
layout_right
-
right_tail_distance
and
next_line
and
next_line
[
'bbox'
][
0
]
==
layout_
lef
t
:
# 现在这行到了行尾沾满,下一行存在且顶格。
para
.
append
(
line
)
else
:
para
.
append
(
line
)
...
...
@@ -197,7 +198,7 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
根据line找到所在的layout
"""
for
layout
in
layout_bboxes
:
if
_is_in
(
line_bbox
,
layout
):
if
_is_in
_or_part_overlap
(
line_bbox
,
layout
):
return
layout
return
None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment