FastGPT/packages/service/core/dataset/training/controller.ts
Archer c51395b2c8
V4.12.0 features (#5435)
* add logs chart (#5352)

* charts

* chart data

* log chart

* delete

* rename api

* fix

* move api

* fix

* fix

* pro config

* fix

* feat: Repository interaction (#5356)

* feat: 1好像功能没问题了,明天再测

* feat: 2 解决了昨天遗留的bug,但全选按钮又bug了

* feat: 3 第三版,解决了全选功能bug

* feat: 4 第四版,下面改小细节

* feat: 5 我勒个痘

* feat: 6

* feat: 6 pr

* feat: 7

* feat: 8

* feat: 9

* feat: 10

* feat: 11

* feat: 12

* perf: checkbox ui

* refactor: tweak login loyout (#5357)

Co-authored-by: Archer <545436317@qq.com>

* login ui

* app chat log chart pro display (#5392)

* app chat log chart pro display

* add canopen props

* perf: pro tag tip

* perf: pro tag tip

* feat: openrouter provider (#5406)

* perf: login ui

* feat: openrouter provider

* provider

* perf: custom error throw

* perf: emb batch (#5407)

* perf: emb batch

* perf: vector retry

* doc

* doc (#5411)

* doc

* fix: team folder will add to workflow

* fix: generateToc shell

* Tool price (#5376)

* resolve conflicts for cherry-pick

* fix i18n

* Enhance system plugin template data structure and update ToolSelectModal to include CostTooltip component

* refactor: update systemKeyCost type to support array of objects in plugin and workflow types

* refactor: simplify systemKeyCost type across plugin and workflow types to a single number

* refactor: streamline systemKeyCost handling in plugin and workflow components

* fix

* fix

* perf: toolset price config;fix: workflow array selector ui (#5419)

* fix: workflow array selector ui

* update default model tip

* perf: toolset price config

* doc

* fix: test

* Refactor/chat (#5418)

* refactor: add homepage configuration; add home chat page; add side bar animated collapse and layout

* fix: fix lint rules

* chore: improve logics and code

* chore: more clearer logics

* chore: adjust api

---------

Co-authored-by: Archer <545436317@qq.com>

* perf: chat setting code

* del history

* logo image

* perf: home chat ui

* feat: enhance chat response handling with external links and user info (#5427)

* feat: enhance chat response handling with external links and user info

* fix

* cite code

* perf: toolset add in workflow

* fix: test

* fix: search paraentId

* Fix/chat (#5434)

* wip: rebase了upstream

* wip: adapt mobile UI

* fix: fix chat page logic and UI

* fix: fix UI and improve some logics

* fix: model selector missing logo; vision model to retrieve file

* perf: role selector

* fix: chat ui

* optimize export app chat log (#5436)

* doc

* chore: move components to proper directory; fix the api to get app list (#5437)

* chore: improve team app panel display form (#5438)

* feat: add home chat log tab

* chore: improve team app panel display form

* chore: improve log panel

* fix: spec

* doc

* fix: log permission

* fix: dataset schema required

* add loading status

* remove ui weight

* manage log

* fix: log detail per

* doc

* fix: log menu

* rename permission

* bg color

* fix: app log per

* fix: log key selector

* fix: log

* doc

---------

Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: colnii <1286949794@qq.com>
Co-authored-by: 伍闲犬 <76519998+xqvvu@users.noreply.github.com>
Co-authored-by: Ctrlz <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: 伍闲犬 <whoeverimf5@gmail.com>
Co-authored-by: heheer <heheer@sealos.io>
2025-08-12 22:22:18 +08:00

192 lines
4.8 KiB
TypeScript

import { MongoDatasetTraining } from './schema';
import type {
PushDatasetDataChunkProps,
PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { type ClientSession } from '../../../common/mongo';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { getCollectionWithDataset } from '../controller';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { type PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
import { i18nT } from '../../../../web/i18n/utils';
import { getLLMMaxChunkSize } from '../../../../global/core/dataset/training/utils';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try {
await MongoDatasetTraining.updateMany(
{
teamId
},
{
lockTime: new Date('2999/5/5')
}
);
} catch (error) {}
};
export async function pushDataListToTrainingQueue({
teamId,
tmbId,
datasetId,
collectionId,
agentModel,
vectorModel,
vlmModel,
data,
billId,
mode = TrainingModeEnum.chunk,
indexSize,
session
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(i18nT('common:error_embedding_not_config'));
}
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(i18nT('common:error_llm_not_config'));
}
const { model, maxToken, weight } = await (async () => {
if (mode === TrainingModeEnum.chunk) {
return {
maxToken: Infinity,
model: vectorModelData.model,
weight: vectorModelData.weight
};
}
if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
return {
maxToken: getLLMMaxChunkSize(agentModelData),
model: agentModelData.model,
weight: 0
};
}
if (mode === TrainingModeEnum.image || mode === TrainingModeEnum.imageParse) {
const vllmModelData = getVlmModel(vlmModel);
if (!vllmModelData) {
return Promise.reject(i18nT('common:error_vlm_not_config'));
}
return {
maxToken: getLLMMaxChunkSize(vllmModelData),
model: vllmModelData.model,
weight: 0
};
}
return Promise.reject(`Training mode "${mode}" is inValid`);
})();
// format q and a, remove empty char
data = data.filter((item) => {
const q = item.q || '';
const a = item.a || '';
// filter repeat content
if (!item.imageId && !q) {
return;
}
const text = q + a;
// Oversize llm tokens
if (text.length > maxToken) {
return;
}
return true;
});
// insert data to db
const insertLen = data.length;
// 使用 insertMany 批量插入
const batchSize = 500;
const insertData = async (startIndex: number, session: ClientSession) => {
const list = data.slice(startIndex, startIndex + batchSize);
if (list.length === 0) return;
try {
const result = await MongoDatasetTraining.insertMany(
list.map((item) => ({
teamId,
tmbId,
datasetId: datasetId,
collectionId: collectionId,
billId,
mode,
...(item.q && { q: item.q }),
...(item.a && { a: item.a }),
...(item.imageId && { imageId: item.imageId }),
chunkIndex: item.chunkIndex ?? 0,
indexSize,
weight: weight ?? 0,
indexes: item.indexes,
retryCount: 5
})),
{
session,
ordered: false,
rawResult: true,
includeResultMetadata: false // 进一步减少返回数据
}
);
if (result.insertedCount !== list.length) {
return Promise.reject(`Insert data error, ${JSON.stringify(result)}`);
}
} catch (error: any) {
addLog.error(`Insert error`, error);
return Promise.reject(error);
}
return insertData(startIndex + batchSize, session);
};
if (session) {
await insertData(0, session);
} else {
await mongoSessionRun(async (session) => {
await insertData(0, session);
});
}
return {
insertLen
};
}
export const pushDatasetToParseQueue = async ({
teamId,
tmbId,
datasetId,
collectionId,
billId,
session
}: {
teamId: string;
tmbId: string;
datasetId: string;
collectionId: string;
billId: string;
session: ClientSession;
}) => {
await MongoDatasetTraining.create(
[
{
teamId,
tmbId,
datasetId,
collectionId,
billId,
mode: TrainingModeEnum.parse
}
],
{ session, ordered: true }
);
};