mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 10:12:51 +00:00
feat: Using the HuggingFace tokenizer (#4329)
This commit is contained in:
parent
df1f1d6bd2
commit
bb58bbbf46
|
|
@ -6,6 +6,18 @@
|
|||
@date:2024/4/28 10:17
|
||||
@desc:
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent.parent
|
||||
|
||||
|
||||
class MKTokenizer:
    """Thin adapter over a HuggingFace ``tokenizers.Tokenizer``.

    Exposes an ``encode`` method that returns plain token ids (a list of
    ints) instead of the library's ``Encoding`` object.
    """

    def __init__(self, tokenizer):
        # Wrapped tokenizer; assumed to provide encode(text) -> object
        # with an ``.ids`` attribute — TODO confirm against callers.
        self.tokenizer = tokenizer

    def encode(self, text):
        """Tokenize ``text`` and return the resulting token ids."""
        encoding = self.tokenizer.encode(text)
        return encoding.ids
|
||||
|
||||
|
||||
class TokenizerManage:
    """Process-wide cache for the shared HuggingFace tokenizer.

    The underlying ``tokenizers.Tokenizer`` is loaded lazily from a local
    ``tokenizer.json`` file on first use and reused by every later call.
    This replaces the previous ``transformers.BertTokenizer`` based
    implementation; the superseded code has been removed here.
    """

    # Cached raw tokenizers.Tokenizer instance (None until first load).
    # NOTE(review): reconstructed from diff context — confirm the class
    # attribute is declared exactly like this in the original file.
    tokenizer = None

    @staticmethod
    def get_tokenizer():
        """Return a shared :class:`MKTokenizer`, loading the tokenizer once.

        The tokenizer definition is read from
        ``<BASE_DIR parent>/tokenizer/bert-base-cased/tokenizer.json`` and
        must already exist locally (no network download is attempted).
        """
        if TokenizerManage.tokenizer is None:
            # Imported lazily so the tokenizers package is only loaded
            # when a tokenizer is actually needed.
            from tokenizers import Tokenizer
            # Create the Tokenizer from the bundled local definition file.
            tokenizer_path = os.path.join(BASE_DIR.parent, 'tokenizer', 'bert-base-cased', 'tokenizer.json')
            TokenizerManage.tokenizer = Tokenizer.from_file(tokenizer_path)
        return MKTokenizer(TokenizerManage.tokenizer)
|
||||
|
|
|
|||
|
|
@ -6,9 +6,7 @@
|
|||
@date:2024/4/18 15:28
|
||||
@desc:
|
||||
"""
|
||||
from typing import List, Dict
|
||||
|
||||
from langchain_core.messages import BaseMessage, get_buffer_string
|
||||
from typing import Dict
|
||||
|
||||
from common.config.tokenizer_manage_config import TokenizerManage
|
||||
from models_provider.base_model_provider import MaxKBBaseModel
|
||||
|
|
|
|||
|
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
"architectures": [
|
||||
"BertForMaskedLM"
|
||||
],
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"gradient_checkpointing": false,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 768,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 3072,
|
||||
"layer_norm_eps": 1e-12,
|
||||
"max_position_embeddings": 512,
|
||||
"model_type": "bert",
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 12,
|
||||
"pad_token_id": 0,
|
||||
"position_embedding_type": "absolute",
|
||||
"transformers_version": "4.6.0.dev0",
|
||||
"type_vocab_size": 2,
|
||||
"use_cache": true,
|
||||
"vocab_size": 28996
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1 @@
|
|||
{"do_lower_case": false, "model_max_length": 512}
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue