From f6c70da5ffd701c8de438ec766b7c890d3682b7c Mon Sep 17 00:00:00 2001 From: liqiang-fit2cloud Date: Fri, 7 Nov 2025 15:34:36 +0800 Subject: [PATCH] refactor: add cl100k_base.tiktoken tokenizer. --- .github/workflows/build-and-push-vector-model.yml | 2 +- installer/Dockerfile-base | 2 +- installer/Dockerfile-vector-model | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-and-push-vector-model.yml b/.github/workflows/build-and-push-vector-model.yml index 556a398b8..0c51e86f6 100644 --- a/.github/workflows/build-and-push-vector-model.yml +++ b/.github/workflows/build-and-push-vector-model.yml @@ -5,7 +5,7 @@ on: inputs: dockerImageTag: description: 'Docker Image Tag' - default: 'v2.0.2' + default: 'v2.0.3' required: true architecture: description: 'Architecture' diff --git a/installer/Dockerfile-base b/installer/Dockerfile-base index 11d057bca..176809062 100644 --- a/installer/Dockerfile-base +++ b/installer/Dockerfile-base @@ -1,7 +1,7 @@ FROM python:3.11-slim-trixie AS python-stage RUN python3 -m venv /opt/py3 -FROM ghcr.io/1panel-dev/maxkb-vector-model:v2.0.2 AS vector-model +FROM ghcr.io/1panel-dev/maxkb-vector-model:v2.0.3 AS vector-model FROM postgres:17.6-trixie COPY --from=python-stage /usr/local /usr/local diff --git a/installer/Dockerfile-vector-model b/installer/Dockerfile-vector-model index c73e03079..6001ace55 100644 --- a/installer/Dockerfile-vector-model +++ b/installer/Dockerfile-vector-model @@ -25,7 +25,10 @@ COPY --from=vector-model /opt/maxkb/app/model /opt/maxkb-app/model COPY --from=vector-model /opt/maxkb/app/model/base/hub /opt/maxkb-app/model/tokenizer COPY --from=tmp-stage1 model/tokenizer /opt/maxkb-app/model/tokenizer RUN rm -rf /opt/maxkb-app/model/embedding/shibing624_text2vec-base-chinese/onnx - +RUN apk add --update --no-cache curl && \ + mkdir -p openai-tiktoken-cl100k-base && \ + curl -Lf https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken > openai-tiktoken-cl100k-base/cl100k_base.tiktoken && \ + mv -f openai-tiktoken-cl100k-base /opt/maxkb-app/model/tokenizer/ FROM scratch