Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
a5f8de98
Commit
a5f8de98
authored
Mar 08, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocr模式增加截图功能
parent
68e83c12
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
43 additions
and
3 deletions
+43
-3
ocr_demo.py
demo/ocr_demo.py
+3
-2
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+21
-1
ocr_cut_image.py
magic_pdf/pre_proc/ocr_cut_image.py
+19
-0
No files found.
demo/ocr_demo.py
View file @
a5f8de98
...
...
@@ -30,8 +30,8 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_
1
_org.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_
1
.json"
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_
0
_org.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_
0
.json"
try
:
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
...
...
@@ -43,6 +43,7 @@ if __name__ == '__main__':
ocr_pdf_path
,
None
,
ocr_pdf_model_info
,
save_path
,
book_name
,
debug_mode
=
True
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
a5f8de98
import
json
import
os
import
time
...
...
@@ -10,6 +11,7 @@ from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from
magic_pdf.pre_proc.detect_footnote
import
parse_footnotes_by_model
from
magic_pdf.pre_proc.detect_header
import
parse_headers
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
remove_overlaps_min_spans
,
merge_spans_to_line_by_layout
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
...
...
@@ -28,6 +30,7 @@ def parse_pdf_by_ocr(
pdf_path
,
s3_pdf_profile
,
pdf_model_output
,
save_path
,
book_name
,
pdf_model_profile
=
None
,
image_s3_config
=
None
,
...
...
@@ -148,6 +151,10 @@ def parse_pdf_by_ocr(
# 删除remove_span_block_bboxes中的bbox
spans
=
remove_spans_by_bboxes
(
spans
,
need_remove_spans_bboxes
)
# 对image和table截图
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
# 模型识别错误的行间公式, type类型转换成行内公式
...
...
@@ -161,7 +168,7 @@ def parse_pdf_by_ocr(
# 将spans合并成line(在layout内,从上到下,从左到右)
lines
=
merge_spans_to_line_by_layout
(
spans
,
layout_bboxes
)
# logger.info(lines)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks
=
[]
...
...
@@ -175,4 +182,17 @@ def parse_pdf_by_ocr(
page_info
=
construct_page_component
(
page_id
,
blocks
,
layout_bboxes
)
pdf_info_dict
[
f
"page_{page_id}"
]
=
page_info
# 在测试时,保存调试信息
if
debug_mode
:
params_file_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"preproc_out.json"
)
page_draw_rect_save_path
=
join_path
(
save_tmp_path
,
"md"
,
book_name
,
"layout.pdf"
)
with
open
(
params_file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
json
.
dump
(
pdf_info_dict
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
# 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除
if
os
.
path
.
exists
(
page_draw_rect_save_path
):
os
.
remove
(
page_draw_rect_save_path
)
# 绘制bbox和layout到pdf
return
pdf_info_dict
magic_pdf/pre_proc/ocr_cut_image.py
0 → 100644
View file @
a5f8de98
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.pdf_image_tools
import
cut_image
def cut_image_and_table(spans, page, page_id, book_name, save_path):
    """Crop every image and table span out of the page and record the result.

    For each span whose ``'type'`` is ``'image'`` or ``'table'``,
    ``cut_image`` is called with the span's ``'bbox'``, ``page_id``, ``page``
    and a target path built as
    ``join_path(save_path, join_path(book_name, <span type>))``.  Whatever
    ``cut_image`` returns is stored on the span under ``'image_path'``.
    Spans of any other type are left untouched.

    Args:
        spans: Iterable of span dicts.  Every span must have a ``'type'``
            key; image/table spans must also have a ``'bbox'`` key.
        page: Page object forwarded unchanged to ``cut_image``.
        page_id: Page identifier forwarded unchanged to ``cut_image``.
        book_name: First path component of the per-type sub-path.
        save_path: Root that the per-type sub-path is joined onto.

    Returns:
        The same ``spans`` object that was passed in; matching spans are
        updated in place.

    Raises:
        KeyError: If a span lacks ``'type'``, or an image/table span lacks
            ``'bbox'``.
    """
    # Span types that get cropped; the type name doubles as the sub-directory.
    cut_types = ('image', 'table')

    def img_save_path(span_type):
        # Built lazily (only when a span is actually cut) so that calls with
        # no image/table spans never touch join_path.
        return join_path(save_path, join_path(book_name, span_type))

    for span in spans:
        span_type = span['type']
        if span_type in cut_types:
            span['image_path'] = cut_image(
                span['bbox'], page_id, page, img_save_path(span_type)
            )
    return spans
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment