Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
8a2736a5
Commit
8a2736a5
authored
Mar 14, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
截图增加s3上传逻辑,移除宽或高为0的spans
parent
0b35b73c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
11 additions
and
11 deletions
+11
-11
ocr_demo.py
demo/ocr_demo.py
+7
-7
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+1
-1
ocr_cut_image.py
magic_pdf/pre_proc/ocr_cut_image.py
+3
-3
No files found.
demo/ocr_demo.py
View file @
8a2736a5
...
...
@@ -4,7 +4,7 @@ import os
from
loguru
import
logger
from
pathlib
import
Path
from
magic_pdf.dict2md.ocr_mkcontent
import
mk_nlp_markdown
,
mk_mm_markdown
from
magic_pdf.dict2md.ocr_mkcontent
import
ocr_mk_nlp_markdown
,
ocr_
mk_mm_markdown
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
...
...
@@ -30,12 +30,12 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
#
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
ocr_pdf_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
ocr_json_file_path
=
r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1
.json"
#
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_fix
.json"
try
:
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
...
...
@@ -56,8 +56,8 @@ if __name__ == '__main__':
if
not
os
.
path
.
exists
(
parent_dir
):
os
.
makedirs
(
parent_dir
)
# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content
=
mk_mm_markdown
(
pdf_info_dict
)
# markdown_content =
ocr_
mk_nlp_markdown(pdf_info_dict)
markdown_content
=
ocr_
mk_mm_markdown
(
pdf_info_dict
)
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
markdown_content
)
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
8a2736a5
...
...
@@ -208,7 +208,7 @@ def parse_pdf_by_ocr(
spans
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
=
remove_spans_by_bboxes_dict
(
spans
,
need_remove_spans_bboxes_dict
)
# 对image和table截图
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
)
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
displayed_list
=
[]
...
...
magic_pdf/pre_proc/ocr_cut_image.py
View file @
8a2736a5
...
...
@@ -3,7 +3,7 @@ from magic_pdf.libs.ocr_content_type import ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
def
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
):
def
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
,
img_s3_client
):
def
s3_return_path
(
type
):
return
join_path
(
book_name
,
type
)
...
...
@@ -13,8 +13,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
for
span
in
spans
:
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Image
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'images'
))
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'images'
)
,
s3_return_path
=
s3_return_path
(
'images'
),
img_s3_client
=
img_s3_client
)
elif
span_type
==
ContentType
.
Table
:
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'tables'
))
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
img_save_path
(
'tables'
)
,
s3_return_path
=
s3_return_path
(
'tables'
),
img_s3_client
=
img_s3_client
)
return
spans
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment