diff --git a/docSite/content/zh-cn/docs/development/openapi/dataset.md b/docSite/content/zh-cn/docs/development/openapi/dataset.md
index d43b2026d..e8cde92fc 100644
--- a/docSite/content/zh-cn/docs/development/openapi/dataset.md
+++ b/docSite/content/zh-cn/docs/development/openapi/dataset.md
@@ -11,8 +11,6 @@ weight: 853
 | --------------------- | --------------------- |
 | ![](/imgs/getDatasetId.jpg) | ![](/imgs/getfile_id.webp) |
 
-
-
 ## 创建训练订单
 
 {{< tabs tabTotal="2" >}}
@@ -289,7 +287,7 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete?
 
 ## 集合
 
-### 通用创建参数说明
+### 通用创建参数说明(必看)
 
 **入参**
 
@@ -300,8 +298,11 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete?
 | trainingType | 数据处理方式。chunk: 按文本长度进行分割;qa: 问答对提取 | ✅ |
 | autoIndexes | 是否自动生成索引(仅商业版支持) | |
 | imageIndex | 是否自动生成图片索引(仅商业版支持) | |
-| chunkSize | 预估块大小 | |
-| chunkSplitter | 自定义最高优先分割符号 | |
+| chunkSettingMode | 分块参数模式。auto: 系统默认参数; custom: 手动指定参数 | |
+| chunkSplitMode | 分块拆分模式。size: 按长度拆分; char: 按字符拆分。chunkSettingMode=auto时不生效。 | |
+| chunkSize | 分块大小,默认 1500。chunkSettingMode=auto时不生效。 | |
+| indexSize | 索引大小,默认 512,必须小于索引模型最大token。chunkSettingMode=auto时不生效。 | |
+| chunkSplitter | 自定义最高优先分割符号,除非超出文件处理最大上下文,否则不会进行进一步拆分。chunkSettingMode=auto时不生效。 | |
 | qaPrompt | qa拆分提示词 | |
 | tags | 集合标签(字符串数组) | |
 | createTime | 文件创建时间(Date / String) | |
@@ -389,9 +390,8 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
     "name":"测试训练",
 
     "trainingType": "qa",
-    "chunkSize":8000,
-    "chunkSplitter":"",
-    "qaPrompt":"11",
+    "chunkSettingMode": "auto",
+    "qaPrompt":"",
 
     "metadata":{}
 }'
@@ -409,10 +409,6 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
 - parentId: 父级ID,不填则默认为根目录
 - name: 集合名称(必填)
 - metadata: 元数据(暂时没啥用)
-- trainingType: 训练模式(必填)
-- chunkSize: 每个 chunk 的长度(可选). chunk模式:100~3000; qa模式: 4000~模型最大token(16k模型通常建议不超过10000)
-- chunkSplitter: 自定义最高优先分割符号(可选)
-- qaPrompt: qa拆分自定义提示词(可选)
 {{% /alert %}}
 
 {{< /markdownify >}}
@@ -462,8 +458,7 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
     "parentId": null,
 
     "trainingType": "chunk",
-    "chunkSize":512,
-    "chunkSplitter":"",
+    "chunkSettingMode": "auto",
     "qaPrompt":"",
 
     "metadata":{
@@ -483,10 +478,6 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
 - datasetId: 知识库的ID(必填)
 - parentId: 父级ID,不填则默认为根目录
 - metadata.webPageSelector: 网页选择器,用于指定网页中的哪个元素作为文本(可选)
-- trainingType:训练模式(必填)
-- chunkSize: 每个 chunk 的长度(可选). chunk模式:100~3000; qa模式: 4000~模型最大token(16k模型通常建议不超过10000)
-- chunkSplitter: 自定义最高优先分割符号(可选)
-- qaPrompt: qa拆分自定义提示词(可选)
 {{% /alert %}}
 
 {{< /markdownify >}}
@@ -545,13 +536,7 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
 {{% alert icon=" " context="success" %}}
 
 - file: 文件
-- data: 知识库相关信息(json序列化后传入)
-  - datasetId: 知识库的ID(必填)
-  - parentId: 父级ID,不填则默认为根目录
-  - trainingType:训练模式(必填)
-  - chunkSize: 每个 chunk 的长度(可选). 
chunk模式:100~3000; qa模式: 4000~模型最大token(16k模型通常建议不超过10000) - - chunkSplitter: 自定义最高优先分割符号(可选) - - qaPrompt: qa拆分自定义提示词(可选) +- data: 知识库相关信息(json序列化后传入),参数说明见上方“通用创建参数说明” {{% /alert %}} {{< /markdownify >}} diff --git a/docSite/content/zh-cn/docs/development/upgrading/492.md b/docSite/content/zh-cn/docs/development/upgrading/492.md index ec006a73b..73d156129 100644 --- a/docSite/content/zh-cn/docs/development/upgrading/492.md +++ b/docSite/content/zh-cn/docs/development/upgrading/492.md @@ -7,12 +7,17 @@ toc: true weight: 799 --- +## 重要提示 + +- 知识库导入数据 API 变更,增加`chunkSettingMode`,`chunkSplitMode`,`indexSize`可选参数,具体可参考 [知识库导入数据 API](/docs/development/openapi/dataset) 文档。 + ## 🚀 新增内容 -1. 知识库分块增加自定义分隔符预设值,同时支持自定义换行符分割。 -2. 外部变量改名:自定义变量。 并且支持在测试时调试,在分享链接中,该变量直接隐藏。 -3. 集合同步时,支持同步修改标题。 +1. 知识库分块优化:支持单独配置分块大小和索引大小,允许进行超大分块,以更大的输入 Tokens 换取完整分块。 +2. 知识库分块增加自定义分隔符预设值,同时支持自定义换行符分割。 +3. 外部变量改名:自定义变量。 并且支持在测试时调试,在分享链接中,该变量直接隐藏。 +4. 集合同步时,支持同步修改标题。 ## ⚙️ 优化 diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts index 8c56029dd..c0b335af7 100644 --- a/packages/global/common/string/textSplitter.ts +++ b/packages/global/common/string/textSplitter.ts @@ -1,15 +1,17 @@ +import { defaultMaxChunkSize } from '../../core/dataset/training/utils'; import { getErrText } from '../error/utils'; export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----'; type SplitProps = { text: string; - chunkLen: number; + chunkSize: number; + maxSize?: number; overlapRatio?: number; customReg?: string[]; }; -export type TextSplitProps = Omit & { - chunkLen?: number; +export type TextSplitProps = Omit & { + chunkSize?: number; }; type SplitResponse = { @@ -55,7 +57,7 @@ const strIsMdTable = (str: string) => { return true; }; const markdownTableSplit = (props: SplitProps): SplitResponse => { - let { text = '', chunkLen } = props; + let { text = '', chunkSize } = props; const splitText2Lines = text.split('\n'); const header = splitText2Lines[0]; const headerSize = header.split('|').length - 2; @@ -71,7 +73,7 @@ ${mdSplitString} `; for (let i = 2; i < splitText2Lines.length; i++) { - if (chunk.length + splitText2Lines[i].length > chunkLen * 1.2) { + if (chunk.length + splitText2Lines[i].length > chunkSize * 1.2) { chunks.push(chunk); chunk = `${header} ${mdSplitString} @@ -98,11 +100,17 @@ ${mdSplitString} 5. 
标点分割:重叠 */ const commonSplit = (props: SplitProps): SplitResponse => { - let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props; + let { + text = '', + chunkSize, + maxSize = defaultMaxChunkSize, + overlapRatio = 0.15, + customReg = [] + } = props; const splitMarker = 'SPLIT_HERE_SPLIT_HERE'; const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER'; - const overlapLen = Math.round(chunkLen * overlapRatio); + const overlapLen = Math.round(chunkSize * overlapRatio); // replace code block all \n to codeBlockMarker text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) { @@ -118,24 +126,24 @@ const commonSplit = (props: SplitProps): SplitResponse => { const stepReges: { reg: RegExp | string; maxLen: number }[] = [ ...customReg.map((text) => ({ reg: text.replaceAll('\\n', '\n'), - maxLen: chunkLen * 1.4 + maxLen: chunkSize })), - { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 }, - { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 }, - { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 }, - { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 }, - { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 }, + { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize }, + { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize }, + { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize }, + { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize }, + { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize }, - { reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block - { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char - { reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 }, - { reg: /([\n])/g, maxLen: chunkLen * 1.2 }, + { reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block + { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char + { reg: /(\n{2,})/g, maxLen: chunkSize }, + { reg: /([\n])/g, maxLen: chunkSize }, // ------ There's no overlap on the top - { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 }, - { reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 }, - { reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 }, - { reg: /([;]|;\s)/g, maxLen: chunkLen * 1.6 }, - { reg: /([,]|,\s)/g, maxLen: chunkLen * 2 } + { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkSize }, + { reg: /([!]|!\s)/g, maxLen: chunkSize }, + { reg: /([?]|\?\s)/g, maxLen: chunkSize }, + { reg: /([;]|;\s)/g, maxLen: chunkSize }, + { reg: /([,]|,\s)/g, maxLen: chunkSize } ]; const customRegLen = customReg.length; @@ -203,7 +211,7 @@ const commonSplit = (props: SplitProps): SplitResponse => { /* Gets the overlap at the end of a text as the beginning of the next block */ const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => { const forbidOverlap = checkForbidOverlap(step); - const maxOverlapLen = chunkLen * 0.4; + const maxOverlapLen = chunkSize * 0.4; // step >= stepReges.length: Do not overlap incomplete sentences if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return ''; @@ -246,13 +254,13 @@ const commonSplit = (props: SplitProps): SplitResponse => { // oversize if (step >= stepReges.length) { - if (text.length < chunkLen * 3) { + if (text.length < chunkSize * 3) { return [text]; } - // use slice-chunkLen to split text + // use slice-chunkSize to split text const chunks: string[] = []; - for (let i = 0; i < text.length; i += chunkLen - overlapLen) { - chunks.push(text.slice(i, i + chunkLen)); + for (let i = 0; i < text.length; i += chunkSize - overlapLen) { + 
chunks.push(text.slice(i, i + chunkSize)); } return chunks; } @@ -260,8 +268,8 @@ const commonSplit = (props: SplitProps): SplitResponse => { // split text by special char const splitTexts = getSplitTexts({ text, step }); - const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen; - const minChunkLen = chunkLen * 0.7; + const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize; + const minChunkLen = chunkSize * 0.7; const chunks: string[] = []; for (let i = 0; i < splitTexts.length; i++) { @@ -297,7 +305,7 @@ const commonSplit = (props: SplitProps): SplitResponse => { continue; } - // newText is too large(now, The lastText must be smaller than chunkLen) + // newText is too large(now, The lastText must be smaller than chunkSize) if (newTextLen > maxLen) { // lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText) if (lastTextLen > minChunkLen) { @@ -352,7 +360,7 @@ const commonSplit = (props: SplitProps): SplitResponse => { /* If the last chunk is independent, it needs to be push chunks. */ if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) { - if (lastText.length < chunkLen * 0.4) { + if (lastText.length < chunkSize * 0.4) { chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText; } else { chunks.push(lastText); @@ -386,9 +394,9 @@ const commonSplit = (props: SplitProps): SplitResponse => { /** * text split into chunks - * chunkLen - one chunk len. max: 3500 + * chunkSize - one chunk len. max: 3500 * overlapLen - The size of the before and after Text - * chunkLen > overlapLen + * chunkSize > overlapLen * markdown */ export const splitText2Chunks = (props: SplitProps): SplitResponse => { diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts index 99b4aaa3a..40e696b2d 100644 --- a/packages/global/core/dataset/api.d.ts +++ b/packages/global/core/dataset/api.d.ts @@ -1,5 +1,10 @@ import { DatasetDataIndexItemType, DatasetSchemaType } from './type'; -import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants'; +import { + DatasetCollectionTypeEnum, + DatasetCollectionDataProcessModeEnum, + ChunkSettingModeEnum, + DataChunkSplitModeEnum +} from './constants'; import type { LLMModelItemType } from '../ai/model.d'; import { ParentIdType } from 'common/parentFolder/type'; @@ -33,7 +38,13 @@ export type DatasetCollectionChunkMetadataType = { trainingType?: DatasetCollectionDataProcessModeEnum; imageIndex?: boolean; autoIndexes?: boolean; + + chunkSettingMode?: ChunkSettingModeEnum; + chunkSplitMode?: DataChunkSplitModeEnum; + chunkSize?: number; + indexSize?: number; + chunkSplitter?: string; qaPrompt?: string; metadata?: Record; diff --git a/packages/global/core/dataset/constants.ts b/packages/global/core/dataset/constants.ts index 81f52a4fe..627129835 100644 --- a/packages/global/core/dataset/constants.ts +++ b/packages/global/core/dataset/constants.ts @@ -129,6 +129,16 @@ export const DatasetCollectionDataProcessModeMap = { } }; +export enum ChunkSettingModeEnum { + auto = 'auto', + custom = 'custom' +} + +export enum DataChunkSplitModeEnum { + size = 'size', + char = 'char' +} + /* ------------ data -------------- */ /* ------------ training -------------- */ diff --git a/packages/global/core/dataset/controller.d.ts b/packages/global/core/dataset/controller.d.ts index 2382a94e6..7a90ae5eb 100644 --- a/packages/global/core/dataset/controller.d.ts +++ b/packages/global/core/dataset/controller.d.ts @@ -13,6 
+13,7 @@ export type CreateDatasetDataProps = { export type UpdateDatasetDataProps = { dataId: string; + q?: string; a?: string; indexes?: (Omit & { diff --git a/packages/global/core/dataset/training/type.d.ts b/packages/global/core/dataset/training/type.d.ts index 1bc15ea22..3404f1dc4 100644 --- a/packages/global/core/dataset/training/type.d.ts +++ b/packages/global/core/dataset/training/type.d.ts @@ -15,6 +15,8 @@ export type PushDataToTrainingQueueProps = { vectorModel: string; vlmModel?: string; + indexSize?: number; + billId?: string; session?: ClientSession; }; diff --git a/packages/global/core/dataset/training/utils.ts b/packages/global/core/dataset/training/utils.ts new file mode 100644 index 000000000..895837abb --- /dev/null +++ b/packages/global/core/dataset/training/utils.ts @@ -0,0 +1,136 @@ +import { EmbeddingModelItemType, LLMModelItemType } from '../../../core/ai/model.d'; +import { + ChunkSettingModeEnum, + DataChunkSplitModeEnum, + DatasetCollectionDataProcessModeEnum +} from '../constants'; + +export const minChunkSize = 64; // min index and chunk size + +// Chunk size +export const chunkAutoChunkSize = 1500; +export const getMaxChunkSize = (model: LLMModelItemType) => { + return Math.max(model.maxContext - model.maxResponse, 2000); +}; + +// QA +export const defaultMaxChunkSize = 8000; +export const getLLMDefaultChunkSize = (model?: LLMModelItemType) => { + if (!model) return defaultMaxChunkSize; + return Math.max(Math.min(model.maxContext - model.maxResponse, defaultMaxChunkSize), 2000); +}; + +export const getLLMMaxChunkSize = (model?: LLMModelItemType) => { + if (!model) return 8000; + return Math.max(model.maxContext - model.maxResponse, 2000); +}; + +// Index size +export const getMaxIndexSize = (model?: EmbeddingModelItemType) => { + return model?.maxToken || 512; +}; +export const getAutoIndexSize = (model?: EmbeddingModelItemType) => { + return model?.defaultToken || 512; +}; + +const indexSizeSelectList = [ + { + label: '64', + value: 64 + }, + { + label: '128', + value: 128 + }, + { + label: '256', + value: 256 + }, + { + label: '512', + value: 512 + }, + { + label: '768', + value: 768 + }, + { + label: '1024', + value: 1024 + }, + { + label: '1536', + value: 1536 + }, + { + label: '2048', + value: 2048 + }, + { + label: '3072', + value: 3072 + }, + { + label: '4096', + value: 4096 + }, + { + label: '5120', + value: 5120 + }, + { + label: '6144', + value: 6144 + }, + { + label: '7168', + value: 7168 + }, + { + label: '8192', + value: 8192 + } +]; +export const getIndexSizeSelectList = (max = 512) => { + return indexSizeSelectList.filter((item) => item.value <= max); +}; + +// Compute +export const computeChunkSize = (params: { + trainingType: DatasetCollectionDataProcessModeEnum; + chunkSettingMode?: ChunkSettingModeEnum; + chunkSplitMode?: DataChunkSplitModeEnum; + llmModel?: LLMModelItemType; + chunkSize?: number; +}) => { + if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) { + if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { + return getLLMDefaultChunkSize(params.llmModel); + } + } else { + // chunk + if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { + return chunkAutoChunkSize; + } + } + + if (params.chunkSplitMode === DataChunkSplitModeEnum.char) { + return getLLMMaxChunkSize(params.llmModel); + } + + return Math.min(params.chunkSize || chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel)); +}; + +export const computeChunkSplitter = (params: { + chunkSettingMode?: ChunkSettingModeEnum; + chunkSplitMode?: 
DataChunkSplitModeEnum; + chunkSplitter?: string; +}) => { + if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { + return undefined; + } + if (params.chunkSplitMode === DataChunkSplitModeEnum.size) { + return undefined; + } + return params.chunkSplitter; +}; diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts index c9b0aa899..a92785b94 100644 --- a/packages/global/core/dataset/type.d.ts +++ b/packages/global/core/dataset/type.d.ts @@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod import { PermissionTypeEnum } from '../../support/permission/constant'; import { PushDatasetDataChunkProps } from './api'; import { + DataChunkSplitModeEnum, DatasetCollectionDataProcessModeEnum, DatasetCollectionTypeEnum, DatasetStatusEnum, @@ -14,6 +15,7 @@ import { Permission } from '../../support/permission/controller'; import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset'; import { SourceMemberType } from 'support/user/type'; import { DatasetDataIndexTypeEnum } from './data/constants'; +import { ChunkSettingModeEnum } from './constants'; export type DatasetSchemaType = { _id: string; @@ -88,7 +90,12 @@ export type DatasetCollectionSchemaType = { autoIndexes?: boolean; imageIndex?: boolean; trainingType: DatasetCollectionDataProcessModeEnum; - chunkSize: number; + + chunkSettingMode?: ChunkSettingModeEnum; + chunkSplitMode?: DataChunkSplitModeEnum; + + chunkSize?: number; + indexSize?: number; chunkSplitter?: string; qaPrompt?: string; }; diff --git a/packages/global/core/dataset/utils.ts b/packages/global/core/dataset/utils.ts index 64c330c84..803dce59b 100644 --- a/packages/global/core/dataset/utils.ts +++ b/packages/global/core/dataset/utils.ts @@ -1,7 +1,6 @@ import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants'; import { getFileIcon } from '../../common/file/icon'; import { strIsLink } from '../../common/string/tools'; -import { DatasetDataIndexTypeEnum } from './data/constants'; export function getCollectionIcon( type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file, @@ -38,26 +37,6 @@ export function getSourceNameIcon({ return 'file/fill/file'; } -/* get dataset data default index */ -export function getDefaultIndex(props?: { q?: string; a?: string }) { - const { q = '', a } = props || {}; - - return [ - { - text: q, - type: DatasetDataIndexTypeEnum.default - }, - ...(a - ? 
[ - { - text: a, - type: DatasetDataIndexTypeEnum.default - } - ] - : []) - ]; -} - export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => { if (mode === TrainingModeEnum.qa) return data.length * 20; if (mode === TrainingModeEnum.auto) return data.length * 5; diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index 0dfcc6152..44e5d07da 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -27,6 +27,11 @@ import { addDays } from 'date-fns'; import { MongoDatasetDataText } from '../data/dataTextSchema'; import { retryFn } from '@fastgpt/global/common/system/utils'; import { getTrainingModeByCollection } from './utils'; +import { + computeChunkSize, + computeChunkSplitter, + getLLMMaxChunkSize +} from '@fastgpt/global/core/dataset/training/utils'; export const createCollectionAndInsertData = async ({ dataset, @@ -54,18 +59,22 @@ export const createCollectionAndInsertData = async ({ const teamId = createCollectionParams.teamId; const tmbId = createCollectionParams.tmbId; - // Chunk split params + + // Set default params const trainingType = createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk; - const chunkSize = createCollectionParams.chunkSize || 512; - const chunkSplitter = createCollectionParams.chunkSplitter; - const qaPrompt = createCollectionParams.qaPrompt; - const usageName = createCollectionParams.name; + const chunkSize = computeChunkSize({ + ...createCollectionParams, + trainingType, + llmModel: getLLMModel(dataset.agentModel) + }); + const chunkSplitter = computeChunkSplitter(createCollectionParams); // 1. split chunks const chunks = rawText2Chunks({ rawText, - chunkLen: chunkSize, + chunkSize, + maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, customReg: chunkSplitter ? [chunkSplitter] : [], isQAImport @@ -76,7 +85,7 @@ export const createCollectionAndInsertData = async ({ teamId, insertLen: predictDataLimitLength( getTrainingModeByCollection({ - trainingType, + trainingType: trainingType, autoIndexes: createCollectionParams.autoIndexes, imageIndex: createCollectionParams.imageIndex }), @@ -88,6 +97,9 @@ export const createCollectionAndInsertData = async ({ // 3. 
create collection const { _id: collectionId } = await createOneCollection({ ...createCollectionParams, + trainingType, + chunkSize, + chunkSplitter, hashRawText: hashStr(rawText), rawTextLength: rawText.length, @@ -111,7 +123,7 @@ export const createCollectionAndInsertData = async ({ const { billId: newBillId } = await createTrainingUsage({ teamId, tmbId, - appName: usageName, + appName: createCollectionParams.name, billSource: UsageSourceEnum.training, vectorModel: getEmbeddingModel(dataset.vectorModel)?.name, agentModel: getLLMModel(dataset.agentModel)?.name, @@ -130,12 +142,13 @@ export const createCollectionAndInsertData = async ({ agentModel: dataset.agentModel, vectorModel: dataset.vectorModel, vlmModel: dataset.vlmModel, + indexSize: createCollectionParams.indexSize, mode: getTrainingModeByCollection({ - trainingType, + trainingType: trainingType, autoIndexes: createCollectionParams.autoIndexes, imageIndex: createCollectionParams.imageIndex }), - prompt: qaPrompt, + prompt: createCollectionParams.qaPrompt, billId: traingBillId, data: chunks.map((item, index) => ({ ...item, @@ -207,11 +220,14 @@ export async function createOneCollection({ // Parse settings customPdfParse, imageIndex, + autoIndexes, // Chunk settings - trainingType = DatasetCollectionDataProcessModeEnum.chunk, - autoIndexes, - chunkSize = 512, + trainingType, + chunkSettingMode, + chunkSplitMode, + chunkSize, + indexSize, chunkSplitter, qaPrompt, @@ -249,11 +265,14 @@ export async function createOneCollection({ // Parse settings customPdfParse, imageIndex, + autoIndexes, // Chunk settings trainingType, - autoIndexes, + chunkSettingMode, + chunkSplitMode, chunkSize, + indexSize, chunkSplitter, qaPrompt } diff --git a/packages/service/core/dataset/collection/schema.ts b/packages/service/core/dataset/collection/schema.ts index 7e1686f95..9522c69f2 100644 --- a/packages/service/core/dataset/collection/schema.ts +++ b/packages/service/core/dataset/collection/schema.ts @@ -3,7 +3,9 @@ const { Schema, model, models } = connectionMongo; import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d'; import { DatasetCollectionTypeMap, - DatasetCollectionDataProcessModeEnum + DatasetCollectionDataProcessModeEnum, + ChunkSettingModeEnum, + DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants'; import { DatasetCollectionName } from '../schema'; import { @@ -94,11 +96,18 @@ const DatasetCollectionSchema = new Schema({ type: String, enum: Object.values(DatasetCollectionDataProcessModeEnum) }, - chunkSize: { - type: Number, - required: true + chunkSettingMode: { + type: String, + enum: Object.values(ChunkSettingModeEnum) }, + chunkSplitMode: { + type: String, + enum: Object.values(DataChunkSplitModeEnum) + }, + chunkSize: Number, chunkSplitter: String, + + indexSize: Number, qaPrompt: String }); diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts index 730e0dc6c..448396cc3 100644 --- a/packages/service/core/dataset/read.ts +++ b/packages/service/core/dataset/read.ts @@ -185,7 +185,7 @@ export const readApiServerFileContent = async ({ export const rawText2Chunks = ({ rawText, isQAImport, - chunkLen = 512, + chunkSize = 512, ...splitProps }: { rawText: string; @@ -198,7 +198,7 @@ export const rawText2Chunks = ({ const { chunks } = splitText2Chunks({ text: rawText, - chunkLen, + chunkSize, ...splitProps }); diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts index d740eec55..fb5ceacd8 100644 
--- a/packages/service/core/dataset/training/controller.ts +++ b/packages/service/core/dataset/training/controller.ts @@ -12,6 +12,10 @@ import { getCollectionWithDataset } from '../controller'; import { mongoSessionRun } from '../../../common/mongo/sessionRun'; import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type'; import { i18nT } from '../../../../web/i18n/utils'; +import { + getLLMDefaultChunkSize, + getLLMMaxChunkSize +} from '../../../../global/core/dataset/training/utils'; export const lockTrainingDataByTeamId = async (teamId: string): Promise => { try { @@ -55,6 +59,7 @@ export async function pushDataListToTrainingQueue({ prompt, billId, mode = TrainingModeEnum.chunk, + indexSize, session }: PushDataToTrainingQueueProps): Promise { const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => { @@ -68,38 +73,41 @@ export async function pushDataListToTrainingQueue({ } return mode; }; + + const vectorModelData = getEmbeddingModel(vectorModel); + if (!vectorModelData) { + return Promise.reject(i18nT('common:error_embedding_not_config')); + } + const agentModelData = getLLMModel(agentModel); + if (!agentModelData) { + return Promise.reject(i18nT('common:error_llm_not_config')); + } + if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) { + prompt = undefined; + } + const { model, maxToken, weight } = await (async () => { if (mode === TrainingModeEnum.chunk) { - const vectorModelData = getEmbeddingModel(vectorModel); - if (!vectorModelData) { - return Promise.reject(i18nT('common:error_embedding_not_config')); - } return { - maxToken: vectorModelData.maxToken * 1.5, + maxToken: getLLMMaxChunkSize(agentModelData), model: vectorModelData.model, weight: vectorModelData.weight }; } - if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) { - const agentModelData = getLLMModel(agentModel); - if (!agentModelData) { - return Promise.reject(i18nT('common:error_llm_not_config')); - } return { - maxToken: agentModelData.maxContext * 0.8, + maxToken: getLLMMaxChunkSize(agentModelData), model: agentModelData.model, weight: 0 }; } - if (mode === TrainingModeEnum.image) { const vllmModelData = getVlmModel(vlmModel); if (!vllmModelData) { return Promise.reject(i18nT('common:error_vlm_not_config')); } return { - maxToken: vllmModelData.maxContext * 0.8, + maxToken: getLLMMaxChunkSize(vllmModelData), model: vllmModelData.model, weight: 0 }; @@ -107,10 +115,6 @@ export async function pushDataListToTrainingQueue({ return Promise.reject(`Training mode "${mode}" is inValid`); })(); - // Filter redundant params - if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) { - prompt = undefined; - } // filter repeat or equal content const set = new Set(); @@ -143,13 +147,13 @@ export async function pushDataListToTrainingQueue({ const text = item.q + item.a; + // Oversize llm tokens if (text.length > maxToken) { filterResult.overToken.push(item); return; } if (set.has(text)) { - console.log('repeat', item); filterResult.repeat.push(item); } else { filterResult.success.push(item); @@ -182,6 +186,7 @@ export async function pushDataListToTrainingQueue({ q: item.q, a: item.a, chunkIndex: item.chunkIndex ?? 0, + indexSize, weight: weight ?? 
0, indexes: item.indexes, retryCount: 5 diff --git a/packages/service/core/dataset/training/schema.ts b/packages/service/core/dataset/training/schema.ts index 34044674d..d11d2e109 100644 --- a/packages/service/core/dataset/training/schema.ts +++ b/packages/service/core/dataset/training/schema.ts @@ -76,6 +76,7 @@ const TrainingDataSchema = new Schema({ type: Number, default: 0 }, + indexSize: Number, weight: { type: Number, default: 0 diff --git a/packages/web/components/common/MyModal/EditFolderModal.tsx b/packages/web/components/common/MyModal/EditFolderModal.tsx index bc8dfdda9..4647066c6 100644 --- a/packages/web/components/common/MyModal/EditFolderModal.tsx +++ b/packages/web/components/common/MyModal/EditFolderModal.tsx @@ -72,7 +72,7 @@ const EditFolderModal = ({ {...register('name', { required: true })} bg={'myGray.50'} autoFocus - maxLength={20} + maxLength={100} /> diff --git a/packages/web/components/common/Radio/RadioGroup.tsx b/packages/web/components/common/Radio/RadioGroup.tsx new file mode 100644 index 000000000..1443a09dd --- /dev/null +++ b/packages/web/components/common/Radio/RadioGroup.tsx @@ -0,0 +1,67 @@ +import React from 'react'; +import { Box, Flex, Grid, type GridProps, HStack } from '@chakra-ui/react'; +import { useTranslation } from 'next-i18next'; +import QuestionTip from '../MyTooltip/QuestionTip'; + +type Props = Omit & { + list: { + title: string; + value: T; + tooltip?: string; + }[]; + value: T; + defaultBg?: string; + activeBg?: string; + onChange: (e: T) => void; +}; + +const RadioGroup = ({ list, value, onChange, ...props }: Props) => { + const { t } = useTranslation(); + + return ( + + {list.map((item) => ( + onChange(item.value)} + > + + + + + + + {typeof item.title === 'string' ? t(item.title as any) : item.title} + {!!item.tooltip && } + + + ))} + + ); +}; + +export default RadioGroup; diff --git a/packages/web/i18n/en/common.json b/packages/web/i18n/en/common.json index 36e22c152..5581d22e2 100644 --- a/packages/web/i18n/en/common.json +++ b/packages/web/i18n/en/common.json @@ -569,7 +569,6 @@ "core.dataset.import.Custom process": "Custom Rules", "core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules", "core.dataset.import.Custom prompt": "Custom Prompt", - "core.dataset.import.Custom split char": "Custom Separator", "core.dataset.import.Custom text": "Custom Text", "core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset", "core.dataset.import.Data process params": "Data Processing Parameters", diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json index d894d21c8..67a961810 100644 --- a/packages/web/i18n/en/dataset.json +++ b/packages/web/i18n/en/dataset.json @@ -27,7 +27,6 @@ "custom_data_process_params": "Custom", "custom_data_process_params_desc": "Customize data processing rules", "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. 
\nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.", - "data.ideal_chunk_length": "ideal block length", "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes", "data_index_num": "Index {{index}}", "data_process_params": "Params", @@ -53,8 +52,6 @@ "file_model_function_tip": "Enhances indexing and QA generation", "filename": "Filename", "folder_dataset": "Folder", - "ideal_chunk_length": "ideal block length", - "ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.", "image_auto_parse": "Automatic image indexing", "image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes", "image_training_queue": "Queue of image processing", @@ -68,6 +65,8 @@ "import_param_setting": "Parameter settings", "import_select_file": "Select a file", "import_select_link": "Enter link", + "index_size": "Index size", + "index_size_tips": "When vectorized, the system will automatically further segment the blocks according to this size.", "is_open_schedule": "Enable scheduled synchronization", "keep_image": "Keep the picture", "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.", @@ -89,6 +88,8 @@ "retain_collection": "Adjust Training Parameters", "retrain_task_submitted": "The retraining task has been submitted", "same_api_collection": "The same API set exists", + "split_chunk_char": "Block by specified splitter", + "split_chunk_size": "Block by length", "split_sign_break": "1 newline character", "split_sign_break2": "2 newline characters", "split_sign_custom": "Customize", diff --git a/packages/web/i18n/zh-CN/common.json b/packages/web/i18n/zh-CN/common.json index 0e6767d34..c7068843b 100644 --- a/packages/web/i18n/zh-CN/common.json +++ b/packages/web/i18n/zh-CN/common.json @@ -573,7 +573,6 @@ "core.dataset.import.Custom process": "自定义规则", "core.dataset.import.Custom process desc": "自定义设置数据处理规则", "core.dataset.import.Custom prompt": "自定义提示词", - "core.dataset.import.Custom split char": "自定义分隔符", "core.dataset.import.Custom text": "自定义文本", "core.dataset.import.Custom text desc": "手动输入一段文本作为数据集", "core.dataset.import.Data process params": "数据处理参数", diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json index bc4915917..f8af7612b 100644 --- a/packages/web/i18n/zh-CN/dataset.json +++ b/packages/web/i18n/zh-CN/dataset.json @@ -27,7 +27,6 @@ "custom_data_process_params": "自定义", "custom_data_process_params_desc": "自定义设置数据处理规则", "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。", - "data.ideal_chunk_length": "理想分块长度", "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引", "data_index_num": "索引 {{index}}", "data_process_params": "处理参数", @@ -53,8 +52,6 @@ "file_model_function_tip": "用于增强索引和 QA 生成", "filename": "文件名", "folder_dataset": "文件夹", - "ideal_chunk_length": "理想分块长度", - "ideal_chunk_length_tips": "按结束符号进行分段,并将多个分段组成一个分块,该值决定了分块的预估大小,如果会有上下浮动。", "image_auto_parse": "图片自动索引", "image_auto_parse_tips": "调用 VLM 自动标注文档里的图片,并生成额外的检索索引", "image_training_queue": "图片处理排队", @@ -68,6 +65,8 @@ "import_param_setting": "参数设置", 
"import_select_file": "选择文件", "import_select_link": "输入链接", + "index_size": "索引大小", + "index_size_tips": "向量化时内容的长度,系统会自动按该大小对分块进行进一步的分割。", "is_open_schedule": "启用定时同步", "keep_image": "保留图片", "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。", @@ -89,6 +88,8 @@ "retain_collection": "调整训练参数", "retrain_task_submitted": "重新训练任务已提交", "same_api_collection": "存在相同的 API 集合", + "split_chunk_char": "按指定分割符分块", + "split_chunk_size": "按长度分块", "split_sign_break": "1 个换行符", "split_sign_break2": "2 个换行符", "split_sign_custom": "自定义", diff --git a/packages/web/i18n/zh-Hant/common.json b/packages/web/i18n/zh-Hant/common.json index 14e12b5e6..2d3336cc7 100644 --- a/packages/web/i18n/zh-Hant/common.json +++ b/packages/web/i18n/zh-Hant/common.json @@ -568,7 +568,6 @@ "core.dataset.import.Custom process": "自訂規則", "core.dataset.import.Custom process desc": "自訂設定資料處理規則", "core.dataset.import.Custom prompt": "自訂提示詞", - "core.dataset.import.Custom split char": "自訂分隔符", "core.dataset.import.Custom text": "自訂文字", "core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集", "core.dataset.import.Data process params": "資料處理參數", diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json index d1c3404f7..1f687b594 100644 --- a/packages/web/i18n/zh-Hant/dataset.json +++ b/packages/web/i18n/zh-Hant/dataset.json @@ -27,7 +27,6 @@ "custom_data_process_params": "自訂", "custom_data_process_params_desc": "自訂資料處理規則", "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的數據,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.” 表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如: * () [] {} 等。", - "data.ideal_chunk_length": "理想分塊長度", "data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引", "data_index_num": "索引 {{index}}", "data_process_params": "處理參數", @@ -53,8 +52,6 @@ "file_model_function_tip": "用於增強索引和問答生成", "filename": "檔案名稱", "folder_dataset": "資料夾", - "ideal_chunk_length": "理想分塊長度", - "ideal_chunk_length_tips": "依結束符號進行分段,並將多個分段組成一個分塊,此值決定了分塊的預估大小,可能會有上下浮動。", "image_auto_parse": "圖片自動索引", "image_auto_parse_tips": "調用 VLM 自動標註文檔裡的圖片,並生成額外的檢索索引", "image_training_queue": "圖片處理排隊", @@ -68,6 +65,8 @@ "import_param_setting": "參數設置", "import_select_file": "選擇文件", "import_select_link": "輸入鏈接", + "index_size": "索引大小", + "index_size_tips": "向量化時內容的長度,系統會自動按該大小對分塊進行進一步的分割。", "is_open_schedule": "啟用定時同步", "keep_image": "保留圖片", "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。", @@ -89,6 +88,8 @@ "retain_collection": "調整訓練參數", "retrain_task_submitted": "重新訓練任務已提交", "same_api_collection": "存在相同的 API 集合", + "split_chunk_char": "按指定分割符分塊", + "split_chunk_size": "按長度分塊", "split_sign_break": "1 個換行符", "split_sign_break2": "2 個換行符", "split_sign_custom": "自定義", diff --git a/projects/app/src/components/common/Modal/EditResourceModal.tsx b/projects/app/src/components/common/Modal/EditResourceModal.tsx index 5fd5e0f4f..6236d8491 100644 --- a/projects/app/src/components/common/Modal/EditResourceModal.tsx +++ b/projects/app/src/components/common/Modal/EditResourceModal.tsx @@ -71,7 +71,7 @@ const EditResourceModal = ({ {...register('name', { required: true })} bg={'myGray.50'} autoFocus - maxLength={20} + maxLength={100} /> diff --git a/projects/app/src/components/support/apikey/Table.tsx b/projects/app/src/components/support/apikey/Table.tsx index 30574b55b..96561c2b0 100644 --- a/projects/app/src/components/support/apikey/Table.tsx +++ b/projects/app/src/components/support/apikey/Table.tsx @@ -338,7 +338,7 @@ function EditKeyModal({ {t('common:Name')} {t('common:Name')} diff --git 
a/projects/app/src/pageComponents/app/detail/WorkflowComponents/Flow/nodes/render/NodeCard.tsx b/projects/app/src/pageComponents/app/detail/WorkflowComponents/Flow/nodes/render/NodeCard.tsx index b7feff68f..f461ef888 100644 --- a/projects/app/src/pageComponents/app/detail/WorkflowComponents/Flow/nodes/render/NodeCard.tsx +++ b/projects/app/src/pageComponents/app/detail/WorkflowComponents/Flow/nodes/render/NodeCard.tsx @@ -418,7 +418,7 @@ const NodeCard = (props: Props) => { {RenderToolHandle} - + ); }; diff --git a/projects/app/src/pageComponents/app/list/TemplateMarketModal.tsx b/projects/app/src/pageComponents/app/list/TemplateMarketModal.tsx index fbcd1e2b2..9b2553ad3 100644 --- a/projects/app/src/pageComponents/app/list/TemplateMarketModal.tsx +++ b/projects/app/src/pageComponents/app/list/TemplateMarketModal.tsx @@ -319,7 +319,7 @@ const TemplateMarketModal = ({ onChange={(e) => setCurrentSearch(e.target.value)} h={8} bg={'myGray.50'} - maxLength={20} + maxLength={100} borderRadius={'sm'} /> diff --git a/projects/app/src/pageComponents/dataset/EditFolderModal.tsx b/projects/app/src/pageComponents/dataset/EditFolderModal.tsx index 961e01f41..a17a51227 100644 --- a/projects/app/src/pageComponents/dataset/EditFolderModal.tsx +++ b/projects/app/src/pageComponents/dataset/EditFolderModal.tsx @@ -49,7 +49,7 @@ const EditFolderModal = ({ defaultValue={name} placeholder={t('common:dataset.Folder Name') || ''} autoFocus - maxLength={20} + maxLength={100} /> diff --git a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx index d437f39c1..29860701b 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx @@ -10,11 +10,21 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep'; import { Box, Button, Flex, IconButton } from '@chakra-ui/react'; import MyIcon from '@fastgpt/web/components/common/Icon'; import { TabEnum } from '../NavBar'; -import { ChunkSettingModeEnum } from '@/web/core/dataset/constants'; +import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { UseFormReturn, useForm } from 'react-hook-form'; import { ImportSourceItemType } from '@/web/core/dataset/type'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; +import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants'; +import { + getMaxChunkSize, + getLLMDefaultChunkSize, + getLLMMaxChunkSize, + chunkAutoChunkSize, + minChunkSize, + getAutoIndexSize, + getMaxIndexSize +} from '@fastgpt/global/core/dataset/training/utils'; type TrainingFiledType = { chunkOverlapRatio: number; @@ -22,6 +32,9 @@ type TrainingFiledType = { minChunkSize: number; autoChunkSize: number; chunkSize: number; + maxIndexSize?: number; + indexSize?: number; + autoIndexSize?: number; charsPointsPrice: number; priceTip: string; uploadRate: number; @@ -47,9 +60,13 @@ export type ImportFormType = { autoIndexes: boolean; chunkSettingMode: ChunkSettingModeEnum; + + chunkSplitMode: DataChunkSplitModeEnum; embeddingChunkSize: number; qaChunkSize: number; - customSplitChar: string; + chunkSplitter: string; + indexSize: number; + qaPrompt: string; webSelector: string; }; @@ -199,9 +216,12 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode trainingType: DatasetCollectionDataProcessModeEnum.chunk, 
chunkSettingMode: ChunkSettingModeEnum.auto, - embeddingChunkSize: vectorModel?.defaultToken || 512, - qaChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7), - customSplitChar: '', + + chunkSplitMode: DataChunkSplitModeEnum.size, + embeddingChunkSize: 2000, + indexSize: vectorModel?.defaultToken || 512, + qaChunkSize: getLLMDefaultChunkSize(agentModel), + chunkSplitter: '', qaPrompt: Prompt_AgentQA.description, webSelector: '', customPdfParse: false @@ -215,17 +235,18 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode const chunkSettingMode = processParamsForm.watch('chunkSettingMode'); const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize'); const qaChunkSize = processParamsForm.watch('qaChunkSize'); - const customSplitChar = processParamsForm.watch('customSplitChar'); + const chunkSplitter = processParamsForm.watch('chunkSplitter'); const autoIndexes = processParamsForm.watch('autoIndexes'); + const indexSize = processParamsForm.watch('indexSize'); const TrainingModeMap = useMemo(() => { if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { return { chunkSizeField: 'qaChunkSize', chunkOverlapRatio: 0, - maxChunkSize: Math.min(agentModel.maxResponse * 4, agentModel.maxContext * 0.7), - minChunkSize: 4000, - autoChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7), + maxChunkSize: getLLMMaxChunkSize(agentModel), + minChunkSize: 1000, + autoChunkSize: getLLMDefaultChunkSize(agentModel), chunkSize: qaChunkSize, charsPointsPrice: agentModel.charsPointsPrice || 0, priceTip: t('dataset:import.Auto mode Estimated Price Tips', { @@ -237,10 +258,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode return { chunkSizeField: 'embeddingChunkSize', chunkOverlapRatio: 0.2, - maxChunkSize: 2048, - minChunkSize: 100, - autoChunkSize: vectorModel?.defaultToken ? 
vectorModel.defaultToken * 2 : 1024, + maxChunkSize: getMaxChunkSize(agentModel), + minChunkSize: minChunkSize, + autoChunkSize: chunkAutoChunkSize, chunkSize: embeddingChunkSize, + maxIndexSize: getMaxIndexSize(vectorModel), + autoIndexSize: getAutoIndexSize(vectorModel), + indexSize, charsPointsPrice: agentModel.charsPointsPrice || 0, priceTip: t('dataset:import.Auto mode Estimated Price Tips', { price: agentModel.charsPointsPrice @@ -251,10 +275,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode return { chunkSizeField: 'embeddingChunkSize', chunkOverlapRatio: 0.2, - maxChunkSize: vectorModel?.maxToken || 512, - minChunkSize: 100, - autoChunkSize: vectorModel?.defaultToken || 512, + maxChunkSize: getMaxChunkSize(agentModel), + minChunkSize: minChunkSize, + autoChunkSize: chunkAutoChunkSize, chunkSize: embeddingChunkSize, + maxIndexSize: getMaxIndexSize(vectorModel), + autoIndexSize: getAutoIndexSize(vectorModel), + indexSize, charsPointsPrice: vectorModel.charsPointsPrice || 0, priceTip: t('dataset:import.Embedding Estimated Price Tips', { price: vectorModel.charsPointsPrice @@ -265,30 +292,36 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode }, [ trainingType, autoIndexes, - agentModel.maxResponse, - agentModel.maxContext, - agentModel.charsPointsPrice, + agentModel, qaChunkSize, t, - vectorModel.defaultToken, - vectorModel?.maxToken, - vectorModel.charsPointsPrice, - embeddingChunkSize + embeddingChunkSize, + vectorModel, + indexSize ]); const chunkSettingModeMap = useMemo(() => { if (chunkSettingMode === ChunkSettingModeEnum.auto) { return { chunkSize: TrainingModeMap.autoChunkSize, - customSplitChar: '' + indexSize: TrainingModeMap.autoIndexSize, + chunkSplitter: '' }; } else { return { chunkSize: TrainingModeMap.chunkSize, - customSplitChar + indexSize: TrainingModeMap.indexSize, + chunkSplitter }; } - }, [chunkSettingMode, TrainingModeMap.autoChunkSize, TrainingModeMap.chunkSize, customSplitChar]); + }, [ + chunkSettingMode, + TrainingModeMap.autoChunkSize, + TrainingModeMap.autoIndexSize, + TrainingModeMap.chunkSize, + TrainingModeMap.indexSize, + chunkSplitter + ]); const contextValue = { ...TrainingModeMap, diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx index 5a1dd5065..d9366a209 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx @@ -20,10 +20,11 @@ import MyIcon from '@fastgpt/web/components/common/Icon'; import { useTranslation } from 'next-i18next'; import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio'; import { + DataChunkSplitModeEnum, DatasetCollectionDataProcessModeEnum, DatasetCollectionDataProcessModeMap } from '@fastgpt/global/core/dataset/constants'; -import { ChunkSettingModeEnum } from '@/web/core/dataset/constants'; +import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants'; import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; import { useSystemStore } from '@/web/common/system/useSystemStore'; import MyModal from '@fastgpt/web/components/common/MyModal'; @@ -37,25 +38,39 @@ import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip'; import { shadowLight } from '@fastgpt/web/styles/theme'; import { DatasetPageContext } from 
'@/web/core/dataset/context/datasetPageContext'; import MySelect from '@fastgpt/web/components/common/MySelect'; +import { getIndexSizeSelectList } from '@fastgpt/global/core/dataset/training/utils'; +import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup'; function DataProcess() { const { t } = useTranslation(); const { feConfigs } = useSystemStore(); - const { goToNext, processParamsForm, chunkSizeField, minChunkSize, maxChunkSize } = - useContextSelector(DatasetImportContext, (v) => v); + const { + goToNext, + processParamsForm, + chunkSizeField, + minChunkSize, + maxChunkSize, + maxIndexSize, + indexSize + } = useContextSelector(DatasetImportContext, (v) => v); const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail); const { setValue, register, watch, getValues } = processParamsForm; const trainingType = watch('trainingType'); - const chunkSettingMode = watch('chunkSettingMode'); + const trainingModeList = useMemo(() => { + const list = Object.entries(DatasetCollectionDataProcessModeMap); + return list + .filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto) + .map(([key, value]) => ({ + title: t(value.label as any), + value: key as DatasetCollectionDataProcessModeEnum, + tooltip: t(value.tooltip as any) + })); + }, [t]); - const qaPrompt = watch('qaPrompt'); - const { - isOpen: isOpenCustomPrompt, - onOpen: onOpenCustomPrompt, - onClose: onCloseCustomPrompt - } = useDisclosure(); + const chunkSettingMode = watch('chunkSettingMode'); + const chunkSplitMode = watch('chunkSplitMode'); const customSplitList = [ { label: t('dataset:split_sign_null'), value: '' }, @@ -69,25 +84,25 @@ function DataProcess() { { label: t('dataset:split_sign_custom'), value: 'Other' } ]; - const [customListSelectValue, setCustomListSelectValue] = useState(getValues('customSplitChar')); + const [customListSelectValue, setCustomListSelectValue] = useState(getValues('chunkSplitter')); useEffect(() => { if (customListSelectValue === 'Other') { - setValue('customSplitChar', ''); + setValue('chunkSplitter', ''); } else { - setValue('customSplitChar', customListSelectValue); + setValue('chunkSplitter', customListSelectValue); } }, [customListSelectValue, setValue]); - const trainingModeList = useMemo(() => { - const list = Object.entries(DatasetCollectionDataProcessModeMap); - return list - .filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto) - .map(([key, value]) => ({ - title: t(value.label as any), - value: key as DatasetCollectionDataProcessModeEnum, - tooltip: t(value.tooltip as any) - })); - }, [t]); + // Index size + const indexSizeSeletorList = useMemo(() => getIndexSizeSelectList(maxIndexSize), [maxIndexSize]); + + // QA + const qaPrompt = watch('qaPrompt'); + const { + isOpen: isOpenCustomPrompt, + onOpen: onOpenCustomPrompt, + onClose: onCloseCustomPrompt + } = useDisclosure(); const Title = useCallback(({ title }: { title: string }) => { return ( @@ -237,67 +252,97 @@ function DataProcess() { children: chunkSettingMode === ChunkSettingModeEnum.custom && ( - - {t('dataset:ideal_chunk_length')} - - - span': { - display: 'block' + + list={[ + { + title: t('dataset:split_chunk_size'), + value: DataChunkSplitModeEnum.size + }, + { + title: t('dataset:split_chunk_char'), + value: DataChunkSplitModeEnum.char, + tooltip: t('dataset:custom_split_sign_tip') } + ]} + value={chunkSplitMode} + onChange={(e) => { + setValue('chunkSplitMode', e); }} - > - + + {chunkSplitMode === DataChunkSplitModeEnum.size && ( + span': { + display: 'block' + 
} + }} > - - - + + + + + )} + + {chunkSplitMode === DataChunkSplitModeEnum.char && ( + + + + list={customSplitList} + size={'sm'} + bg={'myGray.50'} + value={customListSelectValue} + h={'32px'} + onChange={(val) => { + setCustomListSelectValue(val); + }} + /> + + {customListSelectValue === 'Other' && ( + + )} + + )} - + {trainingType === DatasetCollectionDataProcessModeEnum.chunk && ( - {t('common:core.dataset.import.Custom split char')} - - - - - - - list={customSplitList} - size={'sm'} + + {t('dataset:index_size')} + + + + bg={'myGray.50'} - value={customListSelectValue} - h={'32px'} + list={indexSizeSeletorList} + value={indexSize} onChange={(val) => { - setCustomListSelectValue(val); + setValue('indexSize', val); }} /> - {customListSelectValue === 'Other' && ( - - )} - - + + )} {showQAPromptInput && ( diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx index 1b2ce5c23..37a09bc2c 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx @@ -16,6 +16,7 @@ import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContex import MyBox from '@fastgpt/web/components/common/MyBox'; import Markdown from '@/components/Markdown'; import { useToast } from '@fastgpt/web/hooks/useToast'; +import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; const PreviewData = () => { const { t } = useTranslation(); @@ -23,6 +24,7 @@ const PreviewData = () => { const goToNext = useContextSelector(DatasetImportContext, (v) => v.goToNext); const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId); + const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail); const sources = useContextSelector(DatasetImportContext, (v) => v.sources); const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource); @@ -36,12 +38,13 @@ const PreviewData = () => { async () => { if (!previewFile) return; if (importSource === ImportDataSourceEnum.fileCustom) { - const customSplitChar = processParamsForm.getValues('customSplitChar'); + const chunkSplitter = processParamsForm.getValues('chunkSplitter'); const { chunks } = splitText2Chunks({ text: previewFile.rawText || '', - chunkLen: chunkSize, + chunkSize, + maxSize: getLLMMaxChunkSize(datasetDetail.agentModel), overlapRatio: chunkOverlapRatio, - customReg: customSplitChar ? [customSplitChar] : [] + customReg: chunkSplitter ? 
          [chunkSplitter] : []
       });
       return chunks.map((chunk) => ({
         q: chunk,
@@ -61,9 +64,12 @@ const PreviewData = () => {
       customPdfParse: processParamsForm.getValues('customPdfParse'),
+      trainingType: processParamsForm.getValues('trainingType'),
+      chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
+      chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
       chunkSize,
+      chunkSplitter: processParamsForm.getValues('chunkSplitter'),
       overlapRatio: chunkOverlapRatio,
-      customSplitChar: processParamsForm.getValues('customSplitChar'),
       selector: processParamsForm.getValues('webSelector'),
       isQAImport: importSource === ImportDataSourceEnum.csvTable,
diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx
index 489f9c0f2..45cab2504 100644
--- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx
@@ -49,7 +49,7 @@ const Upload = () => {
   const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
   const retrainNewCollectionId = useRef('');

-  const { importSource, parentId, sources, setSources, processParamsForm, chunkSize } =
+  const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } =
     useContextSelector(DatasetImportContext, (v) => v);
   const { handleSubmit } = processParamsForm;

@@ -81,7 +81,7 @@ const Upload = () => {
   }, [waitingFilesCount, totalFilesCount, allFinished, t]);

   const { runAsync: startUpload, loading: isLoading } = useRequest2(
-    async ({ trainingType, customSplitChar, qaPrompt, webSelector }: ImportFormType) => {
+    async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => {
       if (sources.length === 0) return;
       const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@@ -111,10 +111,16 @@ const Upload = () => {
           trainingType,
           imageIndex: processParamsForm.getValues('imageIndex'),
           autoIndexes: processParamsForm.getValues('autoIndexes'),
+
+          chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
+          chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
+
           chunkSize,
-          chunkSplitter: customSplitChar,
+          indexSize,
+          chunkSplitter,
           qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
         };
+
         if (importSource === ImportDataSourceEnum.reTraining) {
           const res = await postReTrainingDatasetFileCollection({
             ...commonParams,
diff --git a/projects/app/src/pageComponents/dataset/detail/Import/components/PreviewChunks.tsx b/projects/app/src/pageComponents/dataset/detail/Import/components/PreviewChunks.tsx
deleted file mode 100644
index 1248764bb..000000000
--- a/projects/app/src/pageComponents/dataset/detail/Import/components/PreviewChunks.tsx
+++ /dev/null
@@ -1,102 +0,0 @@
-import React from 'react';
-import { Box } from '@chakra-ui/react';
-import { ImportSourceItemType } from '@/web/core/dataset/type';
-import MyRightDrawer from '@fastgpt/web/components/common/MyDrawer/MyRightDrawer';
-import { getPreviewChunks } from '@/web/core/dataset/api';
-import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
-import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
-import { useContextSelector } from 'use-context-selector';
-import { DatasetImportContext } from '../Context';
-import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
-import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
-import { getPreviewSourceReadType } from '../utils';
-
-const PreviewChunks = ({
-  previewSource,
-  onClose
-}: {
-  previewSource: ImportSourceItemType;
-  onClose: () => void;
-}) => {
-  const { importSource, chunkSize, chunkOverlapRatio, processParamsForm } = useContextSelector(
-    DatasetImportContext,
-    (v) => v
-  );
-  const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
-
-  const { data = [], loading: isLoading } = useRequest2(
-    async () => {
-      if (importSource === ImportDataSourceEnum.fileCustom) {
-        const customSplitChar = processParamsForm.getValues('customSplitChar');
-        const { chunks } = splitText2Chunks({
-          text: previewSource.rawText || '',
-          chunkLen: chunkSize,
-          overlapRatio: chunkOverlapRatio,
-          customReg: customSplitChar ? [customSplitChar] : []
-        });
-        return chunks.map((chunk) => ({
-          q: chunk,
-          a: ''
-        }));
-      }
-
-      return getPreviewChunks({
-        datasetId,
-        type: getPreviewSourceReadType(previewSource),
-        sourceId:
-          previewSource.dbFileId ||
-          previewSource.link ||
-          previewSource.externalFileUrl ||
-          previewSource.apiFileId ||
-          '',
-
-        chunkSize,
-        overlapRatio: chunkOverlapRatio,
-        customSplitChar: processParamsForm.getValues('customSplitChar'),
-
-        selector: processParamsForm.getValues('webSelector'),
-        isQAImport: importSource === ImportDataSourceEnum.csvTable,
-        externalFileId: previewSource.externalFileId
-      });
-    },
-    {
-      manual: false
-    }
-  );
-
-  return (
-
-
-      {data.map((item, index) => (
-
-          {item.q}
-          {item.a}
-
-      ))}
-
-
-  );
-};
-
-export default React.memo(PreviewChunks);
diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx
index 068e0ea64..23f28ced3 100644
--- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx
@@ -8,10 +8,11 @@ import { useRouter } from 'next/router';
 import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
 import { getDatasetCollectionById } from '@/web/core/dataset/api';
 import MyBox from '@fastgpt/web/components/common/MyBox';
-import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
+import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
-import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
 import { Box } from '@chakra-ui/react';
+import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
+import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';

 const Upload = dynamic(() => import('../commonProgress/Upload'));
 const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
@@ -23,7 +24,6 @@ const ReTraining = () => {
     collectionId: string;
   };

-  const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
   const activeStep = useContextSelector(DatasetImportContext, (v) => v.activeStep);
   const setSources = useContextSelector(DatasetImportContext, (v) => v.setSources);
   const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
@@ -46,18 +46,21 @@ const ReTraining = () => {
           uploadedFileRate: 100
         }
       ]);
+
       processParamsForm.reset({
         customPdfParse: collection.customPdfParse,
         trainingType: collection.trainingType,
         imageIndex: collection.imageIndex,
         autoIndexes: collection.autoIndexes,
-        chunkSettingMode: ChunkSettingModeEnum.auto,
+        chunkSettingMode: collection.chunkSettingMode || ChunkSettingModeEnum.auto,
+        chunkSplitMode: collection.chunkSplitMode || DataChunkSplitModeEnum.size,
         embeddingChunkSize: collection.chunkSize,
         qaChunkSize: collection.chunkSize,
-        customSplitChar: collection.chunkSplitter,
-        qaPrompt: collection.qaPrompt,
-        webSelector: collection.metadata?.webPageSelector
+        indexSize: collection.indexSize || 512,
+        chunkSplitter: collection.chunkSplitter,
+        webSelector: collection.metadata?.webPageSelector,
+        qaPrompt: collection.qaPrompt || Prompt_AgentQA.description
       });
     }
   });
diff --git a/projects/app/src/pages/account/info/index.tsx b/projects/app/src/pages/account/info/index.tsx
index ff650630b..963177a00 100644
--- a/projects/app/src/pages/account/info/index.tsx
+++ b/projects/app/src/pages/account/info/index.tsx
@@ -294,7 +294,7 @@ const MyInfo = ({ onOpenContact }: { onOpenContact: () => void }) => {
               title={t('account_info:click_modify_nickname')}
               borderColor={'transparent'}
               transform={'translateX(-11px)'}
-              maxLength={20}
+              maxLength={100}
               onBlur={async (e) => {
                 const val = e.target.value;
                 if (val === userInfo?.team?.memberName) return;
diff --git a/projects/app/src/pages/api/core/dataset/collection/create/reTrainingCollection.ts b/projects/app/src/pages/api/core/dataset/collection/create/reTrainingCollection.ts
index 0a5353d3c..919014a31 100644
--- a/projects/app/src/pages/api/core/dataset/collection/create/reTrainingCollection.ts
+++ b/projects/app/src/pages/api/core/dataset/collection/create/reTrainingCollection.ts
@@ -2,8 +2,7 @@ import { reTrainingDatasetFileCollectionParams } from '@fastgpt/global/core/data
 import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
 import {
   DatasetCollectionTypeEnum,
-  DatasetSourceReadTypeEnum,
-  TrainingModeEnum
+  DatasetSourceReadTypeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
 import { hashStr } from '@fastgpt/global/common/string/tools';
diff --git a/projects/app/src/pages/api/core/dataset/data/insertData.ts b/projects/app/src/pages/api/core/dataset/data/insertData.ts
index e99e51626..f1b4f0ac5 100644
--- a/projects/app/src/pages/api/core/dataset/data/insertData.ts
+++ b/projects/app/src/pages/api/core/dataset/data/insertData.ts
@@ -4,7 +4,7 @@
  */
 import type { NextApiRequest } from 'next';
 import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken/index';
-import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
+import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
 import { hasSameValue } from '@/service/core/dataset/data/utils';
 import { insertData2Dataset } from '@/service/core/dataset/data/controller';
 import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth';
@@ -16,6 +16,7 @@ import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit
 import { NextAPI } from '@/service/middleware/entry';
 import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
 import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

 async function handler(req: NextApiRequest) {
   const { collectionId, q, a, indexes } = req.body as InsertOneDatasetDataProps;
@@ -45,7 +46,7 @@ async function handler(req: NextApiRequest) {
   // auth collection and get dataset
   const [
     {
-      dataset: { _id: datasetId, vectorModel }
+      dataset: { _id: datasetId, vectorModel, agentModel }
     }
   ] = await Promise.all([getCollectionWithDataset(collectionId)]);
@@ -60,9 +61,11 @@ async function handler(req: NextApiRequest) {
   // token check
   const token = await countPromptTokens(formatQ + formatA, '');
   const vectorModelData = getEmbeddingModel(vectorModel);
+  const llmModelData = getLLMModel(agentModel);
+  const maxChunkSize = getLLMMaxChunkSize(llmModelData);

-  if (token > vectorModelData.maxToken) {
-    return Promise.reject('Q Over Tokens');
+  if (token > maxChunkSize) {
+    return Promise.reject(`Content over max chunk size: ${maxChunkSize}`);
   }

   // Duplicate data check
@@ -82,7 +85,7 @@ async function handler(req: NextApiRequest) {
     q: formatQ,
     a: formatA,
     chunkIndex: 0,
-    model: vectorModelData.model,
+    embeddingModel: vectorModelData.model,
     indexes: formatIndexes
   });
diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
index 71ade6289..caba7d922 100644
--- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
+++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
@@ -1,4 +1,9 @@
-import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  ChunkSettingModeEnum,
+  DataChunkSplitModeEnum,
+  DatasetCollectionDataProcessModeEnum,
+  DatasetSourceReadTypeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
 import { NextAPI } from '@/service/middleware/entry';
 import { ApiRequestProps } from '@fastgpt/service/type/next';
@@ -8,17 +13,30 @@ import {
 } from '@fastgpt/global/support/permission/constant';
 import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
 import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
+import {
+  computeChunkSize,
+  computeChunkSplitter,
+  getLLMMaxChunkSize
+} from '@fastgpt/global/core/dataset/training/utils';
+import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { getLLMModel } from '@fastgpt/service/core/ai/model';

 export type PostPreviewFilesChunksProps = {
   datasetId: string;
   type: DatasetSourceReadTypeEnum;
   sourceId: string;
-  chunkSize: number;
-  overlapRatio: number;
-  customSplitChar?: string;
   customPdfParse?: boolean;
+  trainingType: DatasetCollectionDataProcessModeEnum;
+
+  // Chunk settings
+  chunkSettingMode: ChunkSettingModeEnum;
+  chunkSplitMode: DataChunkSplitModeEnum;
+  chunkSize: number;
+  chunkSplitter?: string;
+  overlapRatio: number;
+
   // Read params
   selector?: string;
   isQAImport?: boolean;
@@ -32,55 +50,64 @@ export type PreviewChunksResponse = {
 async function handler(
   req: ApiRequestProps
 ): Promise {
-  const {
+  let {
     type,
     sourceId,
+    customPdfParse = false,
+
+    trainingType,
+    chunkSettingMode,
+    chunkSplitMode,
     chunkSize,
-    customSplitChar,
+    chunkSplitter,
+    overlapRatio,
     selector,
     isQAImport,
     datasetId,
-    externalFileId,
-    customPdfParse = false
+    externalFileId
   } = req.body;

   if (!sourceId) {
     throw new Error('sourceId is empty');
   }
-  if (chunkSize > 30000) {
-    throw new Error('chunkSize is too large, should be less than 30000');
+
+  const fileAuthRes =
+    type === DatasetSourceReadTypeEnum.fileLocal ? await authCollectionFile({
+          req,
+          authToken: true,
+          authApiKey: true,
+          fileId: sourceId,
+          per: OwnerPermissionVal
+        })
+      : undefined;
+
+  const { dataset, teamId, tmbId } = await authDataset({
+    req,
+    authApiKey: true,
+    authToken: true,
+    datasetId,
+    per: WritePermissionVal
+  });
+
+  if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
+    return Promise.reject(CommonErrEnum.unAuthFile);
   }

-  const { teamId, tmbId, apiServer, feishuServer, yuqueServer } = await (async () => {
-    if (type === DatasetSourceReadTypeEnum.fileLocal) {
-      const res = await authCollectionFile({
-        req,
-        authToken: true,
-        authApiKey: true,
-        fileId: sourceId,
-        per: OwnerPermissionVal
-      });
-      return {
-        teamId: res.teamId,
-        tmbId: res.tmbId
-      };
-    }
-    const { dataset, teamId, tmbId } = await authDataset({
-      req,
-      authApiKey: true,
-      authToken: true,
-      datasetId,
-      per: WritePermissionVal
-    });
-    return {
-      teamId,
-      tmbId,
-      apiServer: dataset.apiServer,
-      feishuServer: dataset.feishuServer,
-      yuqueServer: dataset.yuqueServer
-    };
-  })();
+  chunkSize = computeChunkSize({
+    trainingType,
+    chunkSettingMode,
+    chunkSplitMode,
+    chunkSize,
+    llmModel: getLLMModel(dataset.agentModel)
+  });
+
+  chunkSplitter = computeChunkSplitter({
+    chunkSettingMode,
+    chunkSplitMode,
+    chunkSplitter
+  });

   const { rawText } = await readDatasetSourceRawText({
     teamId,
@@ -89,18 +116,19 @@ async function handler(
     sourceId,
     selector,
     isQAImport,
-    apiServer,
-    feishuServer,
-    yuqueServer,
+    apiServer: dataset.apiServer,
+    feishuServer: dataset.feishuServer,
+    yuqueServer: dataset.yuqueServer,
     externalFileId,
     customPdfParse
   });

   return rawText2Chunks({
     rawText,
-    chunkLen: chunkSize,
+    chunkSize,
+    maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
     overlapRatio,
-    customReg: customSplitChar ? [customSplitChar] : [],
+    customReg: chunkSplitter ? [chunkSplitter] : [],
     isQAImport: isQAImport
   }).slice(0, 10);
 }
diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts
index b5132e2de..3867b56e1 100644
--- a/projects/app/src/service/core/dataset/data/controller.ts
+++ b/projects/app/src/service/core/dataset/data/controller.ts
@@ -5,25 +5,63 @@ import {
   UpdateDatasetDataProps
 } from '@fastgpt/global/core/dataset/controller';
 import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
-import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
 import { jiebaSplit } from '@fastgpt/service/common/string/jieba/index';
 import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
 import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type';
-import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
+import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
 import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
 import { ClientSession } from '@fastgpt/service/common/mongo';
 import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
 import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
+import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
+import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
+import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

-const formatIndexes = ({
+const formatIndexes = async ({
   indexes,
   q,
-  a = ''
+  a = '',
+  indexSize
 }: {
   indexes?: (Omit & { dataId?: string })[];
   q: string;
   a?: string;
-}) => {
+  indexSize: number;
+}): Promise<
+  {
+    type: `${DatasetDataIndexTypeEnum}`;
+    text: string;
+    dataId?: string;
+  }[]
+> => {
+  /* get dataset data default index */
+  const getDefaultIndex = ({
+    q = '',
+    a,
+    indexSize
+  }: {
+    q?: string;
+    a?: string;
+    indexSize: number;
+  }) => {
+    const qChunks = splitText2Chunks({
+      text: q,
+      chunkSize: indexSize
+    }).chunks;
+    const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
+
+    return [
+      ...qChunks.map((text) => ({
+        text,
+        type: DatasetDataIndexTypeEnum.default
+      })),
+      ...aChunks.map((text) => ({
+        text,
+        type: DatasetDataIndexTypeEnum.default
+      }))
+    ];
+  };
+
   indexes = indexes || [];
   // If index not type, set it to custom
   indexes = indexes
@@ -35,7 +73,7 @@ const formatIndexes = ({
     .filter((item) => !!item.text.trim());

   // Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds
-  const defaultIndexes = getDefaultIndex({ q, a });
+  const defaultIndexes = getDefaultIndex({ q, a, indexSize });
   const concatDefaultIndexes = defaultIndexes.map((item) => {
     const oldIndex = indexes!.find((index) => index.text === item.text);
     if (oldIndex) {
@@ -56,11 +94,24 @@ const formatIndexes = ({
     (item, index, self) => index === self.findIndex((t) => t.text === item.text)
   );

-  return indexes.map((index) => ({
-    type: index.type,
-    text: index.text,
-    dataId: index.dataId
-  }));
+  const chekcIndexes = (
+    await Promise.all(
+      indexes.map(async (item) => {
+        // If oversize tokens, split it
+        const tokens = await countPromptTokens(item.text);
+        if (tokens > indexSize) {
+          const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
+          return splitText.map((text) => ({
+            text,
+            type: item.type
+          }));
+        }
+        return item;
+      })
+    )
+  ).flat();
+
+  return chekcIndexes;
 };

 /* insert data.
  * 1. create data id
@@ -75,30 +126,40 @@ export async function insertData2Dataset({
   q,
   a = '',
   chunkIndex = 0,
+  indexSize = 512,
   indexes,
-  model,
+  embeddingModel,
   session
 }: CreateDatasetDataProps & {
-  model: string;
+  embeddingModel: string;
+  indexSize?: number;
   session?: ClientSession;
 }) {
-  if (!q || !datasetId || !collectionId || !model) {
-    return Promise.reject('q, datasetId, collectionId, model is required');
+  if (!q || !datasetId || !collectionId || !embeddingModel) {
+    return Promise.reject('q, datasetId, collectionId, embeddingModel is required');
   }
   if (String(teamId) === String(tmbId)) {
     return Promise.reject("teamId and tmbId can't be the same");
   }
+
+  const embModel = getEmbeddingModel(embeddingModel);
+  indexSize = Math.min(embModel.maxToken, indexSize);
+
   // 1. Get vector indexes and insert
   // Empty indexes check, if empty, create default index
-  const newIndexes = formatIndexes({ indexes, q, a });
+  const newIndexes = await formatIndexes({
+    indexes,
+    q,
+    a,
+    indexSize
+  });

   // insert to vector store
   const result = await Promise.all(
     newIndexes.map(async (item) => {
       const result = await insertDatasetDataVector({
         query: item.text,
-        model: getEmbeddingModel(model),
+        model: embModel,
         teamId,
         datasetId,
         collectionId
@@ -163,8 +224,9 @@ export async function updateData2Dataset({
   q = '',
   a,
   indexes,
-  model
-}: UpdateDatasetDataProps & { model: string }) {
+  model,
+  indexSize = 512
+}: UpdateDatasetDataProps & { model: string; indexSize?: number }) {
   if (!Array.isArray(indexes)) {
     return Promise.reject('indexes is required');
   }
@@ -174,7 +236,7 @@ export async function updateData2Dataset({
   if (!mongoData) return Promise.reject('core.dataset.error.Data not found');

   // 2. Compute indexes
-  const formatIndexesResult = formatIndexes({ indexes, q, a });
+  const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });

   // 3. Patch indexes, create, update, delete
   const patchResult: PatchIndexesProps[] = [];
diff --git a/projects/app/src/service/events/generateQA.ts b/projects/app/src/service/events/generateQA.ts
index 4a335cc22..2273d1edf 100644
--- a/projects/app/src/service/events/generateQA.ts
+++ b/projects/app/src/service/events/generateQA.ts
@@ -21,6 +21,11 @@ import {
   llmCompletionsBodyFormat,
   llmStreamResponseToAnswerText
 } from '@fastgpt/service/core/ai/utils';
+import { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
+import {
+  chunkAutoChunkSize,
+  getLLMMaxChunkSize
+} from '@fastgpt/global/core/dataset/training/utils';

 const reduceQueue = () => {
   global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -129,7 +134,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
       });
       const answer = await llmStreamResponseToAnswerText(chatResponse);

-      const qaArr = formatSplitText(answer, text); // 格式化后的QA对
+      const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // 格式化后的QA对

       addLog.info(`[QA Queue] Finish`, {
         time: Date.now() - startTime,
@@ -180,10 +185,18 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
 }

 // Format qa answer
-function formatSplitText(text: string, rawText: string) {
-  text = text.replace(/\\n/g, '\n'); // 将换行符替换为空格
+function formatSplitText({
+  answer,
+  rawText,
+  llmModel
+}: {
+  answer: string;
+  rawText: string;
+  llmModel: LLMModelItemType;
+}) {
+  answer = answer.replace(/\\n/g, '\n'); // 将换行符替换为空格
   const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // 匹配Q和A的正则表达式
-  const matches = text.matchAll(regex); // 获取所有匹配到的结果
+  const matches = answer.matchAll(regex); // 获取所有匹配到的结果
   const result: PushDatasetDataChunkProps[] = []; // 存储最终的结果

   for (const match of matches) {
@@ -199,7 +212,11 @@ function formatSplitText(text: string, rawText: string) {
   // empty result. direct split chunk
   if (result.length === 0) {
-    const { chunks } = splitText2Chunks({ text: rawText, chunkLen: 512 });
+    const { chunks } = splitText2Chunks({
+      text: rawText,
+      chunkSize: chunkAutoChunkSize,
+      maxSize: getLLMMaxChunkSize(llmModel)
+    });
     chunks.forEach((chunk) => {
       result.push({
         q: chunk,
diff --git a/projects/app/src/service/events/generateVector.ts b/projects/app/src/service/events/generateVector.ts
index a8e4f5ca9..0e7f8c2b4 100644
--- a/projects/app/src/service/events/generateVector.ts
+++ b/projects/app/src/service/events/generateVector.ts
@@ -245,7 +245,7 @@ const insertData = async ({
       a: trainingData.a,
       chunkIndex: trainingData.chunkIndex,
       indexes: trainingData.indexes,
-      model: trainingData.model,
+      embeddingModel: trainingData.model,
       session
     });
     // delete data from training
diff --git a/projects/app/src/web/core/dataset/constants.ts b/projects/app/src/web/core/dataset/constants.ts
index 860e73502..3976d93ee 100644
--- a/projects/app/src/web/core/dataset/constants.ts
+++ b/projects/app/src/web/core/dataset/constants.ts
@@ -60,15 +60,11 @@ export const defaultCollectionDetail: DatasetCollectionItemType = {
   createTime: new Date(),
   trainingType: DatasetCollectionDataProcessModeEnum.chunk,
   chunkSize: 0,
+  indexSize: 512,
   permission: new DatasetPermission(),
   indexAmount: 0
 };

-export enum ChunkSettingModeEnum {
-  auto = 'auto',
-  custom = 'custom'
-}
-
 export const datasetTypeCourseMap: Record<`${DatasetTypeEnum}`, string> = {
   [DatasetTypeEnum.folder]: '',
   [DatasetTypeEnum.dataset]: '',
diff --git a/projects/app/src/web/core/dataset/type.d.ts b/projects/app/src/web/core/dataset/type.d.ts
index a095bc798..0f69d9656 100644
--- a/projects/app/src/web/core/dataset/type.d.ts
+++ b/projects/app/src/web/core/dataset/type.d.ts
@@ -1,6 +1,6 @@
 import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
 import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
-import { ChunkSettingModeEnum } from './constants';
+import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { UseFormReturn } from 'react-hook-form';
 import { APIFileItem } from '@fastgpt/global/core/dataset/apiDataset';
@@ -41,7 +41,7 @@ export type ImportSourceParamsType = UseFormReturn<
   {
     chunkSize: number;
     chunkOverlapRatio: number;
-    customSplitChar: string;
+    chunkSplitter: string;
     prompt: string;
     mode: TrainingModeEnum;
     way: ChunkSettingModeEnum;