Commit 1e73b9fc authored by myhloli

fix: fasttext not support numpy>=2.0.0

parent f14e50e2
import os import os
import json import json
from loguru import logger
from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
current_script_dir = os.path.dirname(os.path.abspath(__file__)) try:
demo_name = "demo1" current_script_dir = os.path.dirname(os.path.abspath(__file__))
pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf") demo_name = "demo1"
model_path = os.path.join(current_script_dir, f"{demo_name}.json") pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
pdf_bytes = open(pdf_path, "rb").read() model_path = os.path.join(current_script_dir, f"{demo_name}.json")
model_json = json.loads(open(model_path, "r", encoding="utf-8").read()) pdf_bytes = open(pdf_path, "rb").read()
jso_useful_key = {"_pdf_type": "", "model_list": model_json} model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
local_image_dir = os.path.join(current_script_dir, 'images') jso_useful_key = {"_pdf_type": "", "model_list": model_json}
image_dir = str(os.path.basename(local_image_dir)) local_image_dir = os.path.join(current_script_dir, 'images')
image_writer = DiskReaderWriter(local_image_dir) image_dir = str(os.path.basename(local_image_dir))
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) image_writer = DiskReaderWriter(local_image_dir)
pipe.pipe_classify() pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_parse() pipe.pipe_classify()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") pipe.pipe_parse()
with open(f"{demo_name}.md", "w", encoding="utf-8") as f: md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
f.write(md_content) f.write(md_content)
except Exception as e:
logger.exception(e)
\ No newline at end of file
import unicodedata import unicodedata
from fast_langdetect import detect_langs from fast_langdetect import detect_language
def detect_lang(text: str) -> str: def detect_lang(text: str) -> str:
if len(text) == 0: if len(text) == 0:
return "" return ""
try: try:
lang_upper = detect_langs(text) lang_upper = detect_language(text)
except: except:
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]]) html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
lang_upper = detect_langs(html_no_ctrl_chars) lang_upper = detect_language(html_no_ctrl_chars)
try: try:
lang = lang_upper.lower() lang = lang_upper.lower()
except: except:
......
...@@ -8,4 +8,5 @@ fast-langdetect>=0.1.1 ...@@ -8,4 +8,5 @@ fast-langdetect>=0.1.1
wordninja>=2.0.0 wordninja>=2.0.0
scikit-learn>=1.0.2 scikit-learn>=1.0.2
pdfminer.six>=20231228 pdfminer.six>=20231228
numpy<2.0.0 #2.0版本与fasttext不兼容
# requirements.txt 须保证只引入必需的外部依赖,如有新依赖添加请联系项目管理员 # requirements.txt 须保证只引入必需的外部依赖,如有新依赖添加请联系项目管理员
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment