mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
31 lines
827 B
Python
31 lines
827 B
Python
# coding=utf-8
"""
    @project: maxkb
    @Author:虎
    @file: tokenizer_manage_config.py
    @date:2024/4/28 10:17
    @desc: Loads the bundled bert-base-cased tokenizer from the local model
           directory and wraps it so callers can encode text to token ids.
"""
|
||
|
||
import os
|
||
|
||
class MKTokenizer:
    """Thin adapter around a tokenizer object that exposes token ids directly."""

    def __init__(self, tokenizer):
        """Store the tokenizer that ``encode`` will delegate to.

        :param tokenizer: object whose ``encode(text)`` result has an ``ids`` attribute
        """
        self.tokenizer = tokenizer

    def encode(self, text):
        """Encode ``text`` with the wrapped tokenizer and return only the ids.

        :param text: the text passed through to the wrapped tokenizer
        :return: the ``ids`` attribute of the wrapped tokenizer's encoding result
        """
        encoding = self.tokenizer.encode(text)
        return encoding.ids
|
||
|
||
|
||
class TokenizerManage:
    """Holder for the shared ``bert-base-cased`` tokenizer loaded from local disk."""

    # Cached tokenizer instance; stays None until the first successful load.
    tokenizer = None

    @staticmethod
    def get_tokenizer():
        """Return an ``MKTokenizer`` wrapping the shared tokenizer.

        The tokenizer file is read from disk only on the first call; later
        calls reuse the instance stored on ``TokenizerManage.tokenizer``
        instead of re-parsing ``tokenizer.json`` every time.

        :return: a new ``MKTokenizer`` around the cached tokenizer
        :raises OSError: if ``refs/main`` under the model directory cannot be read
            (errors raised by ``Tokenizer.from_file`` propagate unchanged)
        """
        if TokenizerManage.tokenizer is None:
            # Kept as a function-local import, as in the original code, and now
            # only executed when a load is actually needed.
            from tokenizers import Tokenizer

            # Create the tokenizer from the locally bundled model files.
            model_path = os.path.join("/opt/maxkb-app", "model", "tokenizer", "models--bert-base-cased")
            with open(f"{model_path}/refs/main", encoding="utf-8") as f:
                # Strip surrounding whitespace: a trailing newline in the ref
                # file would otherwise end up inside the snapshot path.
                snapshot = f.read().strip()
            TokenizerManage.tokenizer = Tokenizer.from_file(
                f"{model_path}/snapshots/{snapshot}/tokenizer.json"
            )
        return MKTokenizer(TokenizerManage.tokenizer)
|