Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
7162debc
Commit
7162debc
authored
Mar 26, 2024
by
kernel.h@qq.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
实现文本型PDF解析结果转标准格式
parent
a343175d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
59 additions
and
14 deletions
+59
-14
pipeline.py
magic_pdf/pipeline.py
+1
-14
pipeline_txt.py
magic_pdf/pipeline_txt.py
+37
-0
base.py
magic_pdf/spark/base.py
+21
-0
No files found.
magic_pdf/pipeline.py
View file @
7162debc
...
...
@@ -23,17 +23,11 @@ from loguru import logger
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_for_train
import
parse_pdf_for_train
from
magic_pdf.spark.base
import
exception_handler
,
get_data_source
from
magic_pdf.train_utils.convert_to_train_format
import
convert_to_train_format
from
app.common.s3
import
get_s3_config
,
get_s3_client
def exception_handler(jso: dict, e):
    """Log an exception and flag the record as one that must be dropped.

    Args:
        jso: Record dict; it is mutated in place.
        e: The exception that was caught by the caller.

    Returns:
        The same ``jso`` object with ``need_drop`` set to ``True``,
        ``drop_reason`` set to ``DropReason.Exception`` and ``exception``
        holding the formatted error text.
    """
    logger.exception(e)
    jso.update(
        need_drop=True,
        drop_reason=DropReason.Exception,
        exception=f"ERROR: {e}",
    )
    return jso
def
get_data_type
(
jso
:
dict
):
data_type
=
jso
.
get
(
"data_type"
)
...
...
@@ -49,13 +43,6 @@ def get_bookid(jso: dict):
return
book_id
def get_data_source(jso: dict):
    """Return the record's data source, falling back to ``file_source``.

    Args:
        jso: Record dict to read from.

    Returns:
        The value stored under ``data_source``; when that key is absent or
        holds ``None``, the value stored under ``file_source`` (which may
        itself be ``None``).
    """
    source = jso.get("data_source")
    # Only a missing/None value triggers the fallback; other falsy values
    # such as "" are returned unchanged.
    return jso.get("file_source") if source is None else source
def
meta_scan
(
jso
:
dict
,
doc_layout_check
=
True
)
->
dict
:
s3_pdf_path
=
jso
.
get
(
"file_location"
)
s3_config
=
get_s3_config
(
s3_pdf_path
)
...
...
magic_pdf/pipeline_txt.py
0 → 100644
View file @
7162debc
"""
文本型pdf转化为统一清洗格式
"""
from
loguru
import
logger
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.spark.base
import
exception_handler
,
get_data_source
def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
    """Convert a text PDF's intermediate parse result into the standard content list.

    Args:
        jso: Record dict; it is mutated in place. Reads ``need_drop``,
            ``file_id``, ``pdf_intermediate_dict`` and the data-source keys.
        debug_mode: When true, the ``need_drop`` check is skipped and the
            record is always processed.

    Returns:
        The record. On the drop path ``dropped`` is set to ``True`` and the
        record is returned otherwise untouched. On success ``content_list``
        holds the converted result and ``doc_layout_result``,
        ``pdf_intermediate_dict`` and ``pdf_meta`` are reset to ``""``.
        If anything in the conversion raises, the result of
        ``exception_handler(jso, e)`` is returned instead.
    """
    # Outside debug mode, a record already flagged with need_drop is only
    # marked as dropped and returned without conversion.
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # Decompress pdf_intermediate_dict before converting it.
        compressed = jso["pdf_intermediate_dict"]
        intermediate = JsonCompressor.decompress_json(compressed)
        standard_format = mk_universal_format(intermediate)
        jso["content_list"] = standard_format
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",
        )
        # Clear the fields that are no longer needed.
        for stale_key in ("doc_layout_result", "pdf_intermediate_dict", "pdf_meta"):
            jso[stale_key] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso
magic_pdf/spark/base.py
0 → 100644
View file @
7162debc
from
loguru
import
logger
from
magic_pdf.libs.drop_reason
import
DropReason
def get_data_source(jso: dict):
    """Return the record's data source, falling back to ``file_source``.

    Args:
        jso: Record dict to read from.

    Returns:
        The value stored under ``data_source``; when that key is absent or
        holds ``None``, the value stored under ``file_source`` (which may
        itself be ``None``).
    """
    source = jso.get("data_source")
    # Only a missing/None value triggers the fallback; other falsy values
    # such as "" are returned unchanged.
    return jso.get("file_source") if source is None else source
def exception_handler(jso: dict, e):
    """Log an exception and flag the record as one that must be dropped.

    Args:
        jso: Record dict; it is mutated in place.
        e: The exception that was caught by the caller.

    Returns:
        The same ``jso`` object with ``need_drop`` set to ``True``,
        ``drop_reason`` set to ``DropReason.Exception`` and ``exception``
        holding the formatted error text.
    """
    logger.exception(e)
    jso.update(
        need_drop=True,
        drop_reason=DropReason.Exception,
        exception=f"ERROR: {e}",
    )
    return jso
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment