Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
2277e31f
Commit
2277e31f
authored
Mar 22, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocr_demo main函数精简
parent
7d010e19
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
15 additions
and
13 deletions
+15
-13
ocr_demo.py
demo/ocr_demo.py
+15
-13
No files found.
demo/ocr_demo.py
View file @
2277e31f
...
@@ -6,7 +6,13 @@ from pathlib import Path
...
@@ -6,7 +6,13 @@ from pathlib import Path
from
app.common.s3
import
get_s3_config
from
app.common.s3
import
get_s3_config
from
demo.demo_test
import
get_json_from_local_or_s3
from
demo.demo_test
import
get_json_from_local_or_s3
from
magic_pdf.dict2md.ocr_mkcontent
import
ocr_mk_mm_markdown_with_para
,
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
from
magic_pdf.dict2md.ocr_mkcontent
import
(
ocr_mk_mm_markdown_with_para
,
ocr_mk_nlp_markdown
,
ocr_mk_mm_markdown
,
ocr_mk_mm_standard_format
,
ocr_mk_mm_markdown_with_para_and_pagination
)
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
...
@@ -47,7 +53,7 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
...
@@ -47,7 +53,7 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
# logger.info(json_object)
# logger.info(json_object)
s3_pdf_path
=
json_object
[
"file_location"
]
s3_pdf_path
=
json_object
[
"file_location"
]
s3_config
=
get_s3_config
(
s3_pdf_path
)
s3_config
=
get_s3_config
(
s3_pdf_path
)
ocr_pdf_model_info
=
json_object
[
"doc_layout_result"
]
ocr_pdf_model_info
=
json_object
.
get
(
"doc_layout_result"
)
ocr_parse_core
(
book_name
,
s3_pdf_path
,
ocr_pdf_model_info
,
s3_config
=
s3_config
)
ocr_parse_core
(
book_name
,
s3_pdf_path
,
ocr_pdf_model_info
,
s3_config
=
s3_config
)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
exception
(
e
)
logger
.
exception
(
e
)
...
@@ -72,6 +78,7 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
...
@@ -72,6 +78,7 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
# markdown_content = mk_nlp_markdown(pdf_info_dict)
# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content
=
ocr_mk_mm_markdown_with_para
(
pdf_info_dict
)
markdown_content
=
ocr_mk_mm_markdown_with_para
(
pdf_info_dict
)
# markdown_pagination = ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict)
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
text_content_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
markdown_content
)
f
.
write
(
markdown_content
)
...
@@ -83,14 +90,9 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
...
@@ -83,14 +90,9 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
#ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
# pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
#ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_local_parse(pdf_path, json_file_path)
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
# book_name = "数学新星网/edu_00001236"
# ocr_online_parse(book_name)
ocr_pdf_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
pass
ocr_json_file_path
=
r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
ocr_local_parse
(
ocr_pdf_path
,
ocr_json_file_path
)
#ocr_online_parse(book_name="美国加州中学教材/edu_00000060")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment