refactor: change model path in MKTokenizer.

Authored by liqiang-fit2cloud on 2025-11-07 14:57:04 +08:00; committed by CaptainB
parent f6ebaa7cac
commit bb57783c08
6 changed files with 4 additions and 29027 deletions


@@ -6,11 +6,8 @@
 @date: 2024/4/28 10:17
 @desc:
 """
 import os
-from pathlib import Path
-
-BASE_DIR = Path(__file__).resolve().parent.parent.parent
 
 
 class MKTokenizer:
     def __init__(self, tokenizer):
@@ -27,6 +24,7 @@ class TokenizerManage:
     def get_tokenizer():
         from tokenizers import Tokenizer
         # Create the Tokenizer
-        s = os.path.join(BASE_DIR.parent, 'tokenizer', 'bert-base-cased', 'tokenizer.json')
-        TokenizerManage.tokenizer = Tokenizer.from_file(s)
+        model_path = os.path.join("/opt/maxkb-app", "model", "tokenizer", "models--bert-base-cased")
+        with open(f"{model_path}/refs/main", encoding="utf-8") as f: snapshot = f.read()
+        TokenizerManage.tokenizer = Tokenizer.from_file(f"{model_path}/snapshots/{snapshot}/tokenizer.json")
         return MKTokenizer(TokenizerManage.tokenizer)
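
The new lookup follows the Hugging Face hub cache layout: refs/main holds the current commit hash of the repo, and the actual files sit under snapshots/<hash>/. Below is a minimal standalone sketch of that resolution, assuming the /opt/maxkb-app prefix from this commit; the helper name is hypothetical, and the .strip() is an extra safeguard not present in the committed code (refs/main may end with a newline):

import os

from tokenizers import Tokenizer

def load_cached_tokenizer(cache_root: str, repo_id: str) -> Tokenizer:
    # Hub cache convention: models--<org>--<name>/refs/main stores a commit
    # hash; the snapshot files live under snapshots/<hash>/.
    model_path = os.path.join(cache_root, "models--" + repo_id.replace("/", "--"))
    with open(os.path.join(model_path, "refs", "main"), encoding="utf-8") as f:
        snapshot = f.read().strip()  # safeguard against a trailing newline
    return Tokenizer.from_file(os.path.join(model_path, "snapshots", snapshot, "tokenizer.json"))

# Usage with the path from this commit:
# tokenizer = load_cached_tokenizer("/opt/maxkb-app/model/tokenizer", "bert-base-cased")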


@@ -1,23 +0,0 @@
-{
-  "architectures": [
-    "BertForMaskedLM"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 0,
-  "position_embedding_type": "absolute",
-  "transformers_version": "4.6.0.dev0",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 28996
-}
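
The 23 deleted lines above are the vendored bert-base-cased config.json; after this commit the model files must already exist under /opt/maxkb-app/model/tokenizer in hub cache form. A sketch of pre-populating that layout at build or deploy time, assuming huggingface_hub is available (the allow_patterns file list is an assumption, not part of the commit):

from huggingface_hub import snapshot_download

# Creates models--bert-base-cased/refs/main and snapshots/<hash>/... under
# cache_dir, which is exactly the layout the new get_tokenizer() resolves.
snapshot_download(
    repo_id="bert-base-cased",
    cache_dir="/opt/maxkb-app/model/tokenizer",
    allow_patterns=["tokenizer.json", "config.json", "tokenizer_config.json", "vocab.txt"],
)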

File diff suppressed because one or more lines are too long


@@ -1 +0,0 @@
-{"do_lower_case": false, "model_max_length": 512}

File diff suppressed because it is too large