Commit 959b8d82 authored by 赵小蒙

renamed pipeline file name

parent c9af3457
from magic_pdf.pdf_parse_union_core import pdf_parse_union
def parse_pdf_by_ocr(pdf_bytes,
model_list,
imageWriter,
......
from magic_pdf.pdf_parse_union_core import pdf_parse_union
def parse_pdf_by_txt(
    pdf_bytes,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Parse a PDF using its embedded text layer.

    Thin wrapper that delegates to pdf_parse_union with the parse mode
    fixed to "txt".

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        model_list: Per-page model output consumed by the union pipeline.
        imageWriter: Writer used to persist extracted images.
        start_page_id: First page index to process (0-based).
        end_page_id: Last page index to process, or None for all pages.
        debug_mode: Enable extra debug behavior in the pipeline.

    Returns:
        Whatever pdf_parse_union returns for the "txt" parse mode.
    """
    page_options = {
        "start_page_id": start_page_id,
        "end_page_id": end_page_id,
        "debug_mode": debug_mode,
    }
    return pdf_parse_union(pdf_bytes, model_list, imageWriter, "txt", **page_options)
from magic_pdf.pdf_parse_union_core import pdf_parse_union
def parse_pdf_by_txt(
    pdf_bytes,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Run the text-layer ("txt") parsing path of the union pipeline.

    Forwards every argument unchanged to pdf_parse_union and selects the
    "txt" parse mode.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        model_list: Per-page model output consumed by the union pipeline.
        imageWriter: Writer used to persist extracted images.
        start_page_id: First page index to process (0-based).
        end_page_id: Last page index to process, or None for all pages.
        debug_mode: Enable extra debug behavior in the pipeline.

    Returns:
        The result of pdf_parse_union for this document.
    """
    parse_mode = "txt"
    return pdf_parse_union(
        pdf_bytes,
        model_list,
        imageWriter,
        parse_mode,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=debug_mode,
    )
if __name__ == "__main__":
    # Intentionally no CLI entry point: this module is meant to be used
    # programmatically via parse_pdf_by_txt. A large block of commented-out
    # debug code (loading a hard-coded local PDF and exercising MagicModel,
    # combine_chars_to_pymudict, etc. — names not imported by this module)
    # was removed as dead code.
    pass
......@@ -18,8 +18,8 @@ from loguru import logger
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
# Parse-mode identifiers handed to the union parsing pipeline:
# "txt" selects text-layer extraction, "ocr" selects OCR-based extraction.
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"
......@@ -86,45 +86,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
# text_all = ""
# for page_dict in pdf_info_dict['pdf_info']:
# for para_block in page_dict['para_blocks']:
# if para_block['type'] in ['title', 'text']:
# for line in para_block['lines']:
# for span in line['spans']:
# text_all += span['content']
# def calculate_not_common_character_rate(text):
# garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
# # 计算乱码字符的数量
# garbage_count = len(garbage_regex.findall(text))
# total = len(text)
# if total == 0:
# return 0 # 避免除以零的错误
# return garbage_count / total
#
# def calculate_not_printable_rate(text):
# printable_text = ""
# for c in text:
# if c.isprintable():
# printable_text += c
# printable_total = len(printable_text)
# total = len(text)
# if total == 0:
# return 0 # 避免除以零的错误
# return (total - printable_total) / total
#
# not_common_character_rate = calculate_not_common_character_rate(text_all)
# not_printable_rate = calculate_not_printable_rate(text_all)
# pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
# pdf_info_dict["_not_printable_rate"] = not_printable_rate
# logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
'''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
# not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
if (pdf_info_dict is None
or pdf_info_dict.get("_need_drop", False)
# or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
):
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
if input_model_is_empty:
pdf_models = doc_analyze(pdf_bytes, ocr=True)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment