Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
c38c784e
Commit
c38c784e
authored
Mar 08, 2024
by
liukaiwen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
lkw
parent
00f3e329
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
70 additions
and
1 deletion
+70
-1
ocr_demo.py
demo/ocr_demo.py
+1
-1
ocr_dict_merge.py
magic_pdf/libs/ocr_dict_merge.py
+69
-0
No files found.
demo/ocr_demo.py
View file @
c38c784e
...
...
@@ -28,7 +28,7 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
ocr_json_file_path
=
r"D:\project
\20231108code-clean\ocr\new\demo_4
\ocr_0.json"
ocr_json_file_path
=
r"D:\project
s\Magic-PDF\ocr_demo
\ocr_0.json"
ocr_pdf_info
=
read_json_file
(
ocr_json_file_path
)
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_info
)
markdown_text
=
mk_nlp_markdown
(
pdf_info_dict
)
...
...
magic_pdf/libs/ocr_dict_merge.py
View file @
c38c784e
...
...
@@ -58,3 +58,72 @@ def merge_spans_to_line(spans):
})
return
line_objects
def
modify_y_axis
(
spans
:
list
):
inline_list
=
[]
displayed_list
=
[]
text_list
=
[]
image_list
=
[]
table_list
=
[]
spans
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
1
])
lines
=
[]
current_line
=
[
spans
[
0
]]
if
spans
[
0
][
"type"
]
in
[
"displayed_equation"
,
"image"
,
"table"
]:
displayed_list
.
append
(
spans
[
0
])
line_first_y0
=
spans
[
0
][
"bbox"
][
1
]
line_first_y
=
spans
[
0
][
"bbox"
][
3
]
#用于给行间公式搜索
text_inline_lines
=
[]
for
span
in
spans
[
1
:]:
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# image和table类型,同上
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
s
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
for
s
in
current_line
):
#传入
if
spans
[
0
][
"type"
]
in
[
"displayed_equation"
,
"image"
,
"table"
]:
displayed_list
.
append
(
span
)
# 则开始新行
lines
.
append
(
current_line
)
current_line
=
[
span
]
line_first_y0
=
spans
[
0
][
"bbox"
][
1
]
line_first_y
=
spans
[
0
][
"bbox"
][
3
]
continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
]):
span
[
"bbox"
][
1
]
=
line_first_y0
span
[
"bbox"
][
3
]
=
line_first_y
current_line
.
append
(
span
)
else
:
# 否则,开始新行
lines
.
append
(
current_line
)
text_inline_lines
.
append
((
current_line
,
(
line_first_y0
,
line_first_y
)))
current_line
=
[
span
]
line_first_y0
=
spans
[
0
][
"bbox"
][
1
]
line_first_y
=
spans
[
0
][
"bbox"
][
3
]
# 添加最后一行
if
current_line
:
lines
.
append
(
current_line
)
for
line
in
text_inline_lines
:
# 按照x0坐标排序
line
.
sort
(
key
=
lambda
span
:
span
[
0
][
'bbox'
][
0
])
#错误行间公式转行内公式
for
i
in
range
(
len
(
displayed_list
)):
span
=
displayed_list
[
i
]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment