Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pdf-miner
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Qin Kaijie
pdf-miner
Commits
959b8d82
Commit
959b8d82
authored
Jun 25, 2024
by
赵小蒙
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
renamed pipeline file name
parent
c9af3457
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
23 additions
and
97 deletions
+23
-97
pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+1
-0
pdf_parse_by_txt.py
magic_pdf/pdf_parse_by_txt.py
+19
-0
pdf_parse_by_txt_v2.py
magic_pdf/pdf_parse_by_txt_v2.py
+0
-56
user_api.py
magic_pdf/user_api.py
+3
-41
No files found.
magic_pdf/pdf_parse_by_ocr
_v2
.py
→
magic_pdf/pdf_parse_by_ocr.py
View file @
959b8d82
from
magic_pdf.pdf_parse_union_core
import
pdf_parse_union
from
magic_pdf.pdf_parse_union_core
import
pdf_parse_union
def
parse_pdf_by_ocr
(
pdf_bytes
,
def
parse_pdf_by_ocr
(
pdf_bytes
,
model_list
,
model_list
,
imageWriter
,
imageWriter
,
...
...
magic_pdf/pdf_parse_by_txt.py
0 → 100644
View file @
959b8d82
from
magic_pdf.pdf_parse_union_core
import
pdf_parse_union
def parse_pdf_by_txt(
    pdf_bytes,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Run the shared union parser in ``"txt"`` mode.

    This is a thin wrapper: every argument is forwarded unchanged to
    ``pdf_parse_union`` and the fixed parse mode ``"txt"`` is supplied as
    the fourth positional argument.

    Args:
        pdf_bytes: Forwarded as the first positional argument.
        model_list: Forwarded as the second positional argument.
        imageWriter: Forwarded as the third positional argument.
        start_page_id: Forwarded by keyword (default ``0``).
        end_page_id: Forwarded by keyword (default ``None``).
        debug_mode: Forwarded by keyword (default ``False``).

    Returns:
        Whatever ``pdf_parse_union`` returns.
    """
    # Collect the optional settings once so the delegation stays on one line.
    forwarded_options = {
        "start_page_id": start_page_id,
        "end_page_id": end_page_id,
        "debug_mode": debug_mode,
    }
    return pdf_parse_union(
        pdf_bytes, model_list, imageWriter, "txt", **forwarded_options
    )
magic_pdf/pdf_parse_by_txt_v2.py
deleted
100644 → 0
View file @
c9af3457
from
magic_pdf.pdf_parse_union_core
import
pdf_parse_union
def parse_pdf_by_txt(
    pdf_bytes,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Delegate to ``pdf_parse_union`` with the parse mode fixed to ``"txt"``.

    The three required arguments are passed positionally, followed by the
    literal ``"txt"``; the page range and debug flag are passed by keyword.
    The result of ``pdf_parse_union`` is returned as-is.
    """
    # Positional part of the call: caller inputs plus the fixed mode string.
    positional = (pdf_bytes, model_list, imageWriter, "txt")
    result = pdf_parse_union(
        *positional,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=debug_mode,
    )
    return result
# Script entry guard: running this module directly does nothing (the body is
# only `pass`); the commented-out lines that follow are disabled debug code.
if
__name__
==
"__main__"
:
pass
# if 1:
# import fitz
# import json
#
# with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
# pdf_bytes = f.read()
# pdf_docs = fitz.open("pdf", pdf_bytes)
#
# with open("/opt/data/pdf/20240418/25536-00.json") as f:
# model_list = json.loads(f.readline())
#
# magic_model = MagicModel(model_list, pdf_docs)
# for i in range(7):
# print(magic_model.get_imgs(i))
#
# for page_no, page in enumerate(pdf_docs):
# inline_equations, interline_equations, interline_equation_blocks = (
# magic_model.get_equations(page_no)
# )
#
# text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
# char_level_text_blocks = page.get_text(
# "rawdict", flags=fitz.TEXTFLAGS_TEXT
# )["blocks"]
# text_blocks = combine_chars_to_pymudict(
# text_raw_blocks, char_level_text_blocks
# )
# text_blocks = replace_equations_in_textblock(
# text_blocks, inline_equations, interline_equations
# )
# text_blocks = remove_citation_marker(text_blocks)
#
# text_blocks = remove_chars_in_text_blocks(text_blocks)
magic_pdf/user_api.py
View file @
959b8d82
...
@@ -18,8 +18,8 @@ from loguru import logger
...
@@ -18,8 +18,8 @@ from loguru import logger
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.model.doc_analyze_by_custom_model
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_custom_model
import
doc_analyze
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr
_v2
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt
_v2
import
parse_pdf_by_txt
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
PARSE_TYPE_TXT
=
"txt"
PARSE_TYPE_TXT
=
"txt"
PARSE_TYPE_OCR
=
"ocr"
PARSE_TYPE_OCR
=
"ocr"
...
@@ -86,45 +86,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
...
@@ -86,45 +86,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return
None
return
None
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_txt
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_txt
)
# text_all = ""
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
):
# for page_dict in pdf_info_dict['pdf_info']:
# for para_block in page_dict['para_blocks']:
# if para_block['type'] in ['title', 'text']:
# for line in para_block['lines']:
# for span in line['spans']:
# text_all += span['content']
# def calculate_not_common_character_rate(text):
# garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
# # 计算乱码字符的数量
# garbage_count = len(garbage_regex.findall(text))
# total = len(text)
# if total == 0:
# return 0 # 避免除以零的错误
# return garbage_count / total
#
# def calculate_not_printable_rate(text):
# printable_text = ""
# for c in text:
# if c.isprintable():
# printable_text += c
# printable_total = len(printable_text)
# total = len(text)
# if total == 0:
# return 0 # 避免除以零的错误
# return (total - printable_total) / total
#
# not_common_character_rate = calculate_not_common_character_rate(text_all)
# not_printable_rate = calculate_not_printable_rate(text_all)
# pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
# pdf_info_dict["_not_printable_rate"] = not_printable_rate
# logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
'''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
# not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
if
(
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
)
# or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
if
input_model_is_empty
:
if
input_model_is_empty
:
pdf_models
=
doc_analyze
(
pdf_bytes
,
ocr
=
True
)
pdf_models
=
doc_analyze
(
pdf_bytes
,
ocr
=
True
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment