Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
94a7ba3d
Commit
94a7ba3d
authored
Mar 12, 2024
by
liukaiwen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
lkw
parent
da509143
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
32 additions
and
13 deletions
+32
-13
draw_bbox.py
demo/draw_bbox.py
+17
-3
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+2
-3
ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+13
-7
No files found.
demo/draw_bbox.py
View file @
94a7ba3d
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
pathlib
import
Path
from
magic_pdf.libs.commons
import
fitz
,
join_path
# PyMuPDF
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
import
json
import
json
import
os
...
@@ -20,7 +22,19 @@ doc = fitz.open(pdf_path) # Open the PDF
...
@@ -20,7 +22,19 @@ doc = fitz.open(pdf_path) # Open the PDF
data
=
[[[
-
2
,
0
,
603
,
80
,
24
]],
[[
-
3
,
0
,
602
,
80
,
24
]]]
data
=
[[[
-
2
,
0
,
603
,
80
,
24
]],
[[
-
3
,
0
,
602
,
80
,
24
]]]
ocr_json_file_path
=
r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
ocr_json_file_path
=
r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
ocr_pdf_info
=
read_json_file
(
ocr_json_file_path
)
ocr_pdf_info
=
read_json_file
(
ocr_json_file_path
)
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_info
)
pth
=
Path
(
ocr_json_file_path
)
book_name
=
pth
.
name
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
pdf_info_dict
=
parse_pdf_by_ocr
(
pdf_path
,
None
,
ocr_pdf_info
,
save_path
,
book_name
,
debug_mode
=
True
)
data_list
=
[]
data_list
=
[]
for
page
in
pdf_info_dict
.
values
():
for
page
in
pdf_info_dict
.
values
():
page_list
=
[]
page_list
=
[]
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
94a7ba3d
...
@@ -4,7 +4,6 @@ import time
...
@@ -4,7 +4,6 @@ import time
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.ocr_dict_merge
import
merge_spans_to_line
,
remove_overlaps_min_spans
,
modify_y_axis
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_docx_model_output
from
magic_pdf.libs.commons
import
read_file
,
join_path
,
fitz
,
get_img_s3_client
,
get_delta_time
,
get_docx_model_output
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
from
magic_pdf.libs.safe_filename
import
sanitize_filename
from
magic_pdf.libs.safe_filename
import
sanitize_filename
...
@@ -14,7 +13,7 @@ from magic_pdf.pre_proc.detect_header import parse_headers
...
@@ -14,7 +13,7 @@ from magic_pdf.pre_proc.detect_header import parse_headers
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
remove_overlaps_min_spans
,
merge_spans_to_line_by_layout
from
magic_pdf.pre_proc.ocr_dict_merge
import
remove_overlaps_min_spans
,
merge_spans_to_line_by_layout
,
modify_y_axis
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
...
@@ -150,7 +149,7 @@ def parse_pdf_by_ocr(
...
@@ -150,7 +149,7 @@ def parse_pdf_by_ocr(
spans
=
remove_overlaps_min_spans
(
spans
)
spans
=
remove_overlaps_min_spans
(
spans
)
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
#
spans = modify_y_axis(spans)
spans
=
modify_y_axis
(
spans
)
# 删除remove_span_block_bboxes中的bbox
# 删除remove_span_block_bboxes中的bbox
spans
=
remove_spans_by_bboxes
(
spans
,
need_remove_spans_bboxes
)
spans
=
remove_spans_by_bboxes
(
spans
,
need_remove_spans_bboxes
)
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
94a7ba3d
...
@@ -113,15 +113,19 @@ def modify_y_axis(spans: list):
...
@@ -113,15 +113,19 @@ def modify_y_axis(spans: list):
#用于给行间公式搜索
#用于给行间公式搜索
text_inline_lines
=
[]
text_inline_lines
=
[]
for
span
in
spans
[
1
:]:
for
span
in
spans
[
1
:]:
if
span
.
get
(
"content"
,
""
)
==
"78."
:
print
(
"debug"
)
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# image和table类型,同上
# image和table类型,同上
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
s
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
for
s
in
current_line
):
s
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
for
s
in
current_line
):
#传入
#传入
if
span
s
[
0
]
[
"type"
]
in
[
"displayed_equation"
,
"image"
,
"table"
]:
if
span
[
"type"
]
in
[
"displayed_equation"
,
"image"
,
"table"
]:
displayed_list
.
append
(
span
)
displayed_list
.
append
(
span
)
# 则开始新行
# 则开始新行
lines
.
append
(
current_line
)
lines
.
append
(
current_line
)
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
"text"
,
"inline_equation"
]:
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
current_line
=
[
span
]
current_line
=
[
span
]
line_first_y0
=
span
[
"bbox"
][
1
]
line_first_y0
=
span
[
"bbox"
][
1
]
line_first_y
=
span
[
"bbox"
][
3
]
line_first_y
=
span
[
"bbox"
][
3
]
...
@@ -140,15 +144,14 @@ def modify_y_axis(spans: list):
...
@@ -140,15 +144,14 @@ def modify_y_axis(spans: list):
lines
.
append
(
current_line
)
lines
.
append
(
current_line
)
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
current_line
=
[
span
]
current_line
=
[
span
]
line_first_y0
=
span
s
[
0
]
[
"bbox"
][
1
]
line_first_y0
=
span
[
"bbox"
][
1
]
line_first_y
=
span
s
[
0
]
[
"bbox"
][
3
]
line_first_y
=
span
[
"bbox"
][
3
]
# 添加最后一行
# 添加最后一行
if
current_line
:
if
current_line
:
lines
.
append
(
current_line
)
lines
.
append
(
current_line
)
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
"text"
,
"inline_equation"
]:
if
len
(
current_line
)
>
1
or
current_line
[
0
][
"type"
]
in
[
"text"
,
"inline_equation"
]:
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
for
line
in
text_inline_lines
:
for
line
in
text_inline_lines
:
# 按照x0坐标排序
# 按照x0坐标排序
current_line
=
line
[
0
]
current_line
=
line
[
0
]
...
@@ -164,14 +167,17 @@ def modify_y_axis(spans: list):
...
@@ -164,14 +167,17 @@ def modify_y_axis(spans: list):
#错误行间公式转行内公式
#错误行间公式转行内公式
j
=
0
j
=
0
for
i
in
range
(
len
(
displayed_list
)):
for
i
in
range
(
len
(
displayed_list
)):
if
i
==
8
:
print
(
"debug"
)
span
=
displayed_list
[
i
]
span
=
displayed_list
[
i
]
span_y0
,
span_y
=
span
[
"bbox"
][
1
],
span
[
"bbox"
][
3
]
span_y0
,
span_y
=
span
[
"bbox"
][
1
],
span
[
"bbox"
][
3
]
while
j
<
len
(
text_inline_lines
):
while
j
<
len
(
text_inline_lines
):
text_line
=
text_inline_lines
[
j
]
text_line
=
text_inline_lines
[
j
]
y0
,
y1
=
text_line
[
1
]
y0
,
y1
=
text_line
[
1
]
if
span_y0
<
y0
and
span_y
>
y0
or
span_y0
<
y1
and
span_y
>
y1
or
span_y0
<
y0
and
span_y
>
y1
and
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
if
(
span_y0
<
y0
and
span_y
>
y0
or
span_y0
<
y1
and
span_y
>
y1
or
span_y0
<
y0
and
span_y
>
y1
)
and
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
3
]
=
y1
#
span["bbox"][3] = y1
if
span
[
"type"
]
==
"displayed_equation"
:
if
span
[
"type"
]
==
"displayed_equation"
:
span
[
"type"
]
=
"inline_equation"
span
[
"type"
]
=
"inline_equation"
break
break
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment