Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
83deab21
Commit
83deab21
authored
Mar 11, 2024
by
liukaiwen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
lkw
parent
c38c784e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
65 additions
and
15 deletions
+65
-15
draw_bbox.py
demo/draw_bbox.py
+30
-5
ocr_dict_merge.py
magic_pdf/libs/ocr_dict_merge.py
+33
-8
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+2
-2
No files found.
demo/draw_bbox.py
View file @
83deab21
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
import
json
def read_json_file(file_path):
    """Load and return the parsed contents of the JSON file at *file_path*.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    locale encoding, so JSON containing non-ASCII text loads the same way on
    every OS (the locale default on Windows is usually not UTF-8).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# PDF文件路径
pdf_path
=
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018
.pdf"
pdf_path
=
"D:
\\
project
s
\\
Magic-PDF
\\
ocr_demo
\\
ocr_0_org
.pdf"
doc
=
fitz
.
open
(
pdf_path
)
# Open the PDF
# 你的数据
data
=
[[[
-
2
,
0
,
603
,
80
,
24
]],
[[
-
3
,
0
,
602
,
80
,
24
]]]
ocr_json_file_path
=
r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
ocr_pdf_info
=
read_json_file
(
ocr_json_file_path
)
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_info
)
data_list
=
[]
for
page
in
pdf_info_dict
.
values
():
page_list
=
[]
blocks
=
page
.
get
(
"preproc_blocks"
)
for
block
in
blocks
:
lines
=
block
.
get
(
"lines"
)
for
line
in
lines
:
spans
=
line
.
get
(
"spans"
)
for
span
in
spans
:
page_list
.
append
(
span
[
"bbox"
])
data_list
.
append
(
page_list
)
# 对每个页面进行处理
for
i
,
page
in
enumerate
(
doc
):
# 获取当前页面的数据
page_data
=
data
[
i
]
page_data
=
data
_list
[
i
]
for
img
in
page_data
:
x0
,
y0
,
x1
,
y1
,
_
=
img
x0
,
y0
,
x1
,
y1
=
img
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
(
1
,
0
,
0
),
fill
=
None
,
width
=
1.5
,
overlay
=
True
)
# Draw the rectangle
# Save the PDF
doc
.
save
(
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018_new.pdf"
)
\ No newline at end of file
doc
.
save
(
"D:
\\
projects
\\
Magic-PDF
\\
ocr_demo
\\
ocr_0_new.pdf"
)
\ No newline at end of file
magic_pdf/libs/ocr_dict_merge.py
View file @
83deab21
...
...
@@ -74,6 +74,7 @@ def modify_y_axis(spans: list):
current_line
=
[
spans
[
0
]]
if
spans
[
0
][
"type"
]
in
[
"displayed_equation"
,
"image"
,
"table"
]:
displayed_list
.
append
(
spans
[
0
])
line_first_y0
=
spans
[
0
][
"bbox"
][
1
]
line_first_y
=
spans
[
0
][
"bbox"
][
3
]
#用于给行间公式搜索
...
...
@@ -89,15 +90,16 @@ def modify_y_axis(spans: list):
# 则开始新行
lines
.
append
(
current_line
)
current_line
=
[
span
]
line_first_y0
=
span
s
[
0
]
[
"bbox"
][
1
]
line_first_y
=
span
s
[
0
]
[
"bbox"
][
3
]
line_first_y0
=
span
[
"bbox"
][
1
]
line_first_y
=
span
[
"bbox"
][
3
]
continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
]):
span
[
"bbox"
][
1
]
=
line_first_y0
span
[
"bbox"
][
3
]
=
line_first_y
if
span
[
"bbox"
][
1
]
<
line_first_y0
:
line_first_y0
=
span
[
"bbox"
][
1
]
if
span
[
"bbox"
][
3
]
>
line_first_y
:
line_first_y
=
span
[
"bbox"
][
3
]
current_line
.
append
(
span
)
else
:
...
...
@@ -111,18 +113,41 @@ def modify_y_axis(spans: list):
# 添加最后一行
if
current_line
:
lines
.
append
(
current_line
)
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
"text"
,
"inline_equation"
]:
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
for
line
in
text_inline_lines
:
# 按照x0坐标排序
line
.
sort
(
key
=
lambda
span
:
span
[
0
][
'bbox'
][
0
])
current_line
=
line
[
0
]
current_line
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
0
])
#调整每一个文字行内bbox统一
for
line
in
text_inline_lines
:
current_line
,
(
line_first_y0
,
line_first_y
)
=
line
for
span
in
current_line
:
span
[
"bbox"
][
1
]
=
line_first_y0
span
[
"bbox"
][
3
]
=
line_first_y
#错误行间公式转行内公式
j
=
0
for
i
in
range
(
len
(
displayed_list
)):
span
=
displayed_list
[
i
]
span_y0
,
span_y
=
span
[
"bbox"
][
1
],
span
[
"bbox"
][
3
]
while
j
<
len
(
text_inline_lines
):
text_line
=
text_inline_lines
[
j
]
y0
,
y1
=
text_line
[
1
]
if
span_y0
<
y0
and
span_y
>
y0
or
span_y0
<
y1
and
span_y
>
y1
or
span_y0
<
y0
and
span_y
>
y1
and
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
3
]
=
y1
if
span
[
"type"
]
==
"displayed_equation"
:
span
[
"type"
]
=
"inline_equation"
break
elif
span_y
<
y0
or
span_y0
<
y0
and
span_y
>
y0
and
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
break
else
:
j
+=
1
return
spans
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
83deab21
from
loguru
import
logger
from
magic_pdf.libs.ocr_dict_merge
import
merge_spans_to_line
,
remove_overlaps_min_spans
from
magic_pdf.libs.ocr_dict_merge
import
merge_spans_to_line
,
remove_overlaps_min_spans
,
modify_y_axis
def
construct_page_component
(
page_id
,
blocks
):
...
...
@@ -68,7 +68,7 @@ def parse_pdf_by_ocr(
spans
=
remove_overlaps_min_spans
(
spans
)
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
#spans = modify_y_axis(spans)
# 将spans合并成line(从上到下,从左到右)
lines
=
merge_spans_to_line
(
spans
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment