feat: Using the HuggingFace tokenizer (#4329)

This commit is contained in:
shaohuzhang1 2025-11-07 13:29:17 +08:00 committed by CaptainB
parent df1f1d6bd2
commit bb58bbbf46
7 changed files with 29039 additions and 12 deletions

View File

@@ -6,6 +6,18 @@
@date: 2024/4/28 10:17
@desc:
"""
import os
from pathlib import Path
# Repository root: three parent hops up from this file's directory.
# NOTE(review): assumes this config module sits three directory levels
# below the repo root — confirm against the actual project layout.
BASE_DIR = Path(__file__).resolve().parent.parent.parent
class MKTokenizer:
    """Thin adapter over a HuggingFace ``tokenizers.Tokenizer``.

    Exposes a plain ``encode(text) -> list[int]`` interface by unwrapping
    the ``Encoding`` object that the wrapped tokenizer returns.
    """

    def __init__(self, tokenizer):
        # The wrapped tokenizer; must provide ``encode(text)`` returning
        # an object with an ``ids`` attribute (e.g. tokenizers.Encoding).
        self.tokenizer = tokenizer

    def encode(self, text):
        """Tokenize *text* and return its token ids as a list."""
        encoding = self.tokenizer.encode(text)
        return encoding.ids
class TokenizerManage:
@@ -13,12 +25,8 @@ class TokenizerManage:
# NOTE(review): this span is a rendered unified-diff hunk, not runnable code —
# it interleaves the REMOVED implementation (transformers.BertTokenizer pulled
# from an on-disk cache) with the ADDED one (a bundled tokenizers.Tokenizer
# JSON file wrapped in MKTokenizer). Leading indentation was stripped by the
# page export; do not treat column positions as structure.
@staticmethod
def get_tokenizer():
# --- removed lines (old implementation) ---
from transformers import BertTokenizer
if TokenizerManage.tokenizer is None:
TokenizerManage.tokenizer = BertTokenizer.from_pretrained(
'bert-base-cased',
cache_dir="/opt/maxkb-app/model/tokenizer",
local_files_only=True,
resume_download=False,
force_download=False)
return TokenizerManage.tokenizer
# --- added lines (new implementation) ---
from tokenizers import Tokenizer
# Create the Tokenizer from the bundled bert-base-cased tokenizer.json
s = os.path.join(BASE_DIR.parent, 'tokenizer', 'bert-base-cased', 'tokenizer.json')
TokenizerManage.tokenizer = Tokenizer.from_file(s)
return MKTokenizer(TokenizerManage.tokenizer)

View File

@@ -6,9 +6,7 @@
@date: 2024/4/18 15:28
@desc:
"""
from typing import List, Dict
from langchain_core.messages import BaseMessage, get_buffer_string
from typing import Dict
from common.config.tokenizer_manage_config import TokenizerManage
from models_provider.base_model_provider import MaxKBBaseModel

View File

@@ -0,0 +1,23 @@
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.6.0.dev0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 28996
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1 @@
{"do_lower_case": false, "model_max_length": 512}

File diff suppressed because it is too large Load Diff