Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
9b5b1163
Commit
9b5b1163
authored
Jun 05, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: change garbled_rate 0.1 -> 0.02
parent
c50fa4dc
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
22 additions
and
7 deletions
+22
-7
magicpdf.py
magic_pdf/cli/magicpdf.py
+13
-0
user_api.py
magic_pdf/user_api.py
+9
-4
setup.py
setup.py
+0
-3
No files found.
magic_pdf/cli/magicpdf.py
View file @
9b5b1163
...
...
@@ -44,6 +44,7 @@ from magic_pdf.libs.config_reader import get_local_dir
from
magic_pdf.rw.S3ReaderWriter
import
S3ReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
import
csv
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
...
...
@@ -60,6 +61,15 @@ def prepare_env(pdf_file_name, method):
return
local_image_dir
,
local_md_dir
def write_to_csv(csv_file_path, csv_data):
    """Append a single row to a CSV file and print a confirmation message.

    Args:
        csv_file_path: Path of the CSV file; it is opened in append mode
            with UTF-8 encoding.
        csv_data: The row handed to ``csv.writer(...).writerow``.
    """
    # newline="" leaves line-ending handling to the csv module.
    with open(csv_file_path, "a", encoding="utf-8", newline="") as out:
        # Build the writer and emit the row in one step.
        csv.writer(out).writerow(csv_data)
        print(f"数据已成功追加到 '{csv_file_path}'")
def
_do_parse
(
pdf_file_name
,
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
,
local_md_dir
):
if
parse_method
==
"auto"
:
jso_useful_key
=
{
...
...
@@ -81,6 +91,9 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
# write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
# [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
,
drop_mode
=
DropMode
.
NONE
)
md_writer
.
write
(
content
=
md_content
,
path
=
f
"{pdf_file_name}.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
...
...
magic_pdf/user_api.py
View file @
9b5b1163
"""
用户输入:
model数组,每个元素代表一个页面
...
...
@@ -24,6 +23,7 @@ from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
PARSE_TYPE_TXT
=
"txt"
PARSE_TYPE_OCR
=
"ocr"
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page
=
0
,
*
args
,
**
kwargs
):
"""
...
...
@@ -108,11 +108,16 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return
0
# 避免除以零的错误
return
(
total
-
printable
)
/
total
#
not_common_character_rate = calculate_not_common_character_rate(text_all)
not_common_character_rate
=
calculate_not_common_character_rate
(
text_all
)
not_printable_rate
=
calculate_not_printable_rate
(
text_all
)
# 测试乱码pdf,not_common_character_rate > 0.95, not_printable_rate > 0.15
pdf_info_dict
[
"_not_common_character_rate"
]
=
not_common_character_rate
pdf_info_dict
[
"_not_printable_rate"
]
=
not_printable_rate
logger
.
info
(
f
"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}"
)
# not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
)
or
not_printable_rate
>
0.1
:
if
(
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
)
or
not_printable_rate
>
0.02
# 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr"
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
if
pdf_info_dict
is
None
:
...
...
setup.py
View file @
9b5b1163
import
os
import
subprocess
from
setuptools
import
setup
,
find_packages
from
magic_pdf.libs.version
import
__version__
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment