Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
e492b3dc
Commit
e492b3dc
authored
Apr 11, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
语言检测逻辑移动到parse流程
parent
1b9d65b3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
35 additions
and
16 deletions
+35
-16
detect_language_from_model.py
magic_pdf/libs/detect_language_from_model.py
+21
-0
UNIPipe.py
magic_pdf/spark/UNIPipe.py
+14
-16
No files found.
magic_pdf/libs/detect_language_from_model.py
0 → 100644
View file @
e492b3dc
from
collections
import
Counter
from
magic_pdf.libs.language
import
detect_lang
def get_language_from_model(model_list: list):
    """Return the language detected on the largest number of pages.

    For every page entry in ``model_list``, the ``"text"`` values of the
    ``"layout_dets"`` items whose ``"category_id"`` is allowed are
    concatenated and handed to ``detect_lang``. The per-page result that
    occurs most often is returned.

    Args:
        model_list: Per-page model output. Each entry must provide a
            ``"layout_dets"`` sequence whose items carry ``"category_id"``
            and, for allowed categories, ``"text"``.

    Returns:
        The most frequent per-page value produced by ``detect_lang``. When
        several values are tied, the one encountered first wins.

    Raises:
        ValueError: If ``model_list`` yields no pages, so no language can be
            chosen.
        KeyError: If a page or detection lacks one of the keys read above.
    """
    # Only detections with these category ids contribute text.
    # NOTE(review): 15 is presumably the text category of the layout
    # model - confirm against the model's category table.
    allowed_category_ids = (15,)

    page_languages = []
    for ocr_page_info in model_list:
        # Join once per page instead of growing a string with ``+=``.
        page_text = "".join(
            layout_det["text"]
            for layout_det in ocr_page_info["layout_dets"]
            if layout_det["category_id"] in allowed_category_ids
        )
        page_languages.append(detect_lang(page_text))

    if not page_languages:
        # Without this guard max() below fails with the opaque
        # "max() arg is an empty sequence"; keep the exception type but
        # name the actual cause.
        raise ValueError(
            "model_list is empty; cannot determine the document language"
        )

    # Count how many pages were assigned each language.
    language_counts = Counter(page_languages)
    # Pick the language assigned to the most pages.
    return max(language_counts, key=language_counts.get)
magic_pdf/spark/UNIPipe.py
View file @
e492b3dc
...
...
@@ -4,6 +4,7 @@ from magic_pdf.dict2md.mkcontent import mk_universal_format
from
magic_pdf.dict2md.ocr_mkcontent
import
make_standard_format_with_para
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.libs.detect_language_from_model
import
get_language_from_model
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.spark.spark_api
import
parse_union_pdf
,
parse_ocr_pdf
...
...
@@ -36,16 +37,7 @@ class UNIPipe:
pdf_meta
[
"text_layout_per_page"
],
)
if
is_text_pdf
:
allow_language
=
[
"zh"
,
"en"
]
# 允许的语言,目前只允许简中和英文的
text_language
=
pdf_meta
[
"text_language"
]
logger
.
info
(
f
"pdf meta_scan text_language is {text_language}"
)
if
text_language
not
in
allow_language
:
# 如果语言不在允许的语言中,则drop
if
text_language
==
"un"
:
# unknow的话可能是中文乱码,可以尝试用ocr识别
return
"ocr"
else
:
raise
Exception
(
f
"pdf meta_scan need_drop,reason is {DropReason.NOT_ALLOW_LANGUAGE}"
)
else
:
return
"txt"
return
"txt"
else
:
return
"ocr"
...
...
@@ -53,13 +45,19 @@ class UNIPipe:
"""
根据pdf类型,解析pdf
"""
if
jso_useful_key
[
'_pdf_type'
]
==
"txt"
:
pdf_mid_data
=
parse_union_pdf
(
pdf_bytes
,
jso_useful_key
[
'model_list'
],
image_writer
)
elif
jso_useful_key
[
'_pdf_type'
]
==
"ocr"
:
pdf_mid_data
=
parse_ocr_pdf
(
pdf_bytes
,
jso_useful_key
[
'model_list'
],
image_writer
)
text_language
=
get_language_from_model
(
jso_useful_key
[
'model_list'
])
allow_language
=
[
"zh"
,
"en"
]
# 允许的语言,目前只允许简中和英文的
logger
.
info
(
f
"pdf text_language is {text_language}"
)
if
text_language
not
in
allow_language
:
# 如果语言不在允许的语言中,则drop
raise
Exception
(
f
"pdf meta_scan need_drop,reason is {DropReason.NOT_ALLOW_LANGUAGE}"
)
else
:
raise
Exception
(
f
"pdf type is not txt or ocr"
)
return
JsonCompressor
.
compress
(
pdf_mid_data
)
if
jso_useful_key
[
'_pdf_type'
]
==
"txt"
:
pdf_mid_data
=
parse_union_pdf
(
pdf_bytes
,
jso_useful_key
[
'model_list'
],
image_writer
)
elif
jso_useful_key
[
'_pdf_type'
]
==
"ocr"
:
pdf_mid_data
=
parse_ocr_pdf
(
pdf_bytes
,
jso_useful_key
[
'model_list'
],
image_writer
)
else
:
raise
Exception
(
f
"pdf type is not txt or ocr"
)
return
JsonCompressor
.
compress
(
pdf_mid_data
)
def
mk_uni_format
(
self
,
pdf_mid_data
:
str
,
img_buket_path
:
str
)
->
list
:
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment