Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
7fcbae01
Commit
7fcbae01
authored
Mar 28, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
demo重构
parent
752d620a
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
45 additions
and
37 deletions
+45
-37
README.md
README.md
+3
-1
demo_commons.py
demo/demo_commons.py
+32
-0
ocr_demo.py
demo/ocr_demo.py
+8
-7
text_demo.py
demo/text_demo.py
+2
-29
No files found.
README.md
View file @
7fcbae01
...
...
@@ -28,7 +28,9 @@ pip install -r requirements.txt
3.
Run the main script
```
sh
use demo/demo_test.py
use demo/text_demo.py
or
use demo/ocr_demo.py
```
### 版权说明
...
...
demo/demo_commons.py
0 → 100644
View file @
7fcbae01
import
json
from
app.common.s3
import
get_s3_config
from
magic_pdf.libs.commons
import
join_path
,
read_file
,
json_dump_path
local_json_path
=
"Z:/format.json"
local_jsonl_path
=
"Z:/format.jsonl"
def get_json_from_local_or_s3(book_name=None):
    """Load and parse the source JSON document used by the demos.

    Args:
        book_name: When None, the document is read from the file at
            ``local_json_path``. Otherwise the path is built by joining
            ``json_dump_path`` with ``book_name + ".json"`` and the bytes are
            obtained through ``read_file`` using the config returned by
            ``get_s3_config`` for that path.

    Returns:
        The object produced by parsing the UTF-8 JSON text.
    """
    if book_name is None:
        with open(local_json_path, "r", encoding="utf-8") as local_fp:
            return json.load(local_fp)

    # The source JSON location is configurable: either error_log_path or
    # json_dump_path can serve as the base address.
    remote_path = join_path(json_dump_path, book_name + ".json")
    remote_config = get_s3_config(remote_path)
    raw_bytes = read_file(remote_path, remote_config)
    return json.loads(raw_bytes.decode("utf-8"))
def write_json_to_local(jso, book_name=None):
    """Serialize ``jso`` as JSON into the file at ``local_json_path``.

    Args:
        jso: Object to serialize; non-ASCII characters are written as-is
            (``ensure_ascii=False``).
        book_name: Only the ``None`` case is implemented. For any other
            value the function does nothing.

    Returns:
        None.
    """
    if book_name is not None:
        # No destination is implemented for named books yet; no-op.
        return

    with open(local_json_path, "w", encoding="utf-8") as out_fp:
        serialized = json.dumps(jso, ensure_ascii=False)
        out_fp.write(serialized)
\ No newline at end of file
demo/ocr_demo.py
View file @
7fcbae01
...
...
@@ -5,7 +5,7 @@ from loguru import logger
from
pathlib
import
Path
from
app.common.s3
import
get_s3_config
from
demo.demo_
test
import
get_json_from_local_or_s3
from
demo.demo_
commons
import
get_json_from_local_or_s3
from
magic_pdf.dict2md.ocr_mkcontent
import
(
ocr_mk_mm_markdown_with_para
,
ocr_mk_nlp_markdown
,
...
...
@@ -14,7 +14,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_mm_markdown_with_para_and_pagination
,
make_standard_format_with_para
)
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
,
read_file
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
...
...
@@ -43,7 +43,8 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path):
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
book_name
=
pth
.
name
ocr_parse_core
(
book_name
,
ocr_pdf_path
,
ocr_pdf_model_info
)
pdf_bytes
=
read_file
(
ocr_pdf_path
,
None
)
ocr_parse_core
(
book_name
,
pdf_bytes
,
ocr_pdf_model_info
)
except
Exception
as
e
:
logger
.
exception
(
e
)
...
...
@@ -54,20 +55,20 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
# logger.info(json_object)
s3_pdf_path
=
json_object
[
"file_location"
]
s3_config
=
get_s3_config
(
s3_pdf_path
)
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_config
)
ocr_pdf_model_info
=
json_object
.
get
(
"doc_layout_result"
)
ocr_parse_core
(
book_name
,
s3_pdf_path
,
ocr_pdf_model_info
,
s3_config
=
s3_config
)
ocr_parse_core
(
book_name
,
pdf_bytes
,
ocr_pdf_model_info
)
except
Exception
as
e
:
logger
.
exception
(
e
)
def
ocr_parse_core
(
book_name
,
ocr_pdf_path
,
ocr_pdf_model_info
,
start_page_id
=
0
,
s3_config
=
None
):
def
ocr_parse_core
(
book_name
,
pdf_bytes
,
ocr_pdf_model_info
,
start_page_id
=
0
):
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
save_path_with_bookname
=
os
.
path
.
join
(
save_path
,
book_name
)
text_content_save_path
=
f
"{save_path_with_bookname}/book.md"
pdf_info_dict
=
parse_pdf_by_ocr
(
ocr_pdf_path
,
s3_config
,
pdf_bytes
,
ocr_pdf_model_info
,
save_path
,
book_name
,
...
...
demo/
demo_test
.py
→
demo/
text_demo
.py
View file @
7fcbae01
...
...
@@ -5,6 +5,7 @@ from pathlib import Path
import
click
from
demo.demo_commons
import
get_json_from_local_or_s3
,
write_json_to_local
,
local_jsonl_path
,
local_json_path
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
from
magic_pdf.pipeline
import
(
meta_scan
,
...
...
@@ -13,38 +14,10 @@ from magic_pdf.pipeline import (
pdf_intermediate_dict_to_markdown
,
save_tables_to_s3
,
)
from
magic_pdf.libs.commons
import
join_path
,
read_file
,
json_dump_path
from
app.common.s3
import
get_s3_config
from
magic_pdf.libs.commons
import
join_path
from
loguru
import
logger
local_json_path
=
"Z:/format.json"
local_jsonl_path
=
"Z:/format.jsonl"
def get_json_from_local_or_s3(book_name=None):
    """Load and parse the source JSON document used by the demos.

    Args:
        book_name: When None, the document is read from the file at
            ``local_json_path``. Otherwise the path is built by joining
            ``json_dump_path`` with ``book_name + ".json"`` and the bytes are
            obtained through ``read_file`` using the config returned by
            ``get_s3_config`` for that path.

    Returns:
        The object produced by parsing the UTF-8 JSON text.
    """
    if book_name is None:
        with open(local_json_path, "r", encoding="utf-8") as local_fp:
            return json.load(local_fp)

    # The source JSON location is configurable: either error_log_path or
    # json_dump_path can serve as the base address.
    remote_path = join_path(json_dump_path, book_name + ".json")
    remote_config = get_s3_config(remote_path)
    raw_bytes = read_file(remote_path, remote_config)
    return json.loads(raw_bytes.decode("utf-8"))
def write_json_to_local(jso, book_name=None):
    """Serialize ``jso`` as JSON into the file at ``local_json_path``.

    Args:
        jso: Object to serialize; non-ASCII characters are written as-is
            (``ensure_ascii=False``).
        book_name: Only the ``None`` case is implemented. For any other
            value the function does nothing.

    Returns:
        None.
    """
    if book_name is not None:
        # No destination is implemented for named books yet; no-op.
        return

    with open(local_json_path, "w", encoding="utf-8") as out_fp:
        serialized = json.dumps(jso, ensure_ascii=False)
        out_fp.write(serialized)
def
demo_parse_pdf
(
book_name
=
None
,
start_page_id
=
0
,
debug_mode
=
True
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment