From 0f866fc55268fe37c5606f5e49df2c0f3f62aade Mon Sep 17 00:00:00 2001 From: Archer <545436317@qq.com> Date: Thu, 29 May 2025 17:57:27 +0800 Subject: [PATCH] feat: text collection auto save for a txt file (#4924) --- packages/global/core/dataset/utils.ts | 1 + .../service/common/file/gridfs/controller.ts | 2 +- packages/service/common/file/gridfs/utils.ts | 52 +++++++++++++++++++ .../Import/diffSource/FileCustomText.tsx | 2 +- .../core/dataset/collection/create/text.ts | 19 +++++-- .../dataset/queues}/generateQA.ts | 2 +- .../dataset/queues}/generateVector.ts | 0 .../{events => core/dataset/queues}/utils.ts | 4 +- .../service/core/dataset/training/utils.ts | 4 +- 9 files changed, 76 insertions(+), 10 deletions(-) rename projects/app/src/service/{events => core/dataset/queues}/generateQA.ts (98%) rename projects/app/src/service/{events => core/dataset/queues}/generateVector.ts (100%) rename projects/app/src/service/{events => core/dataset/queues}/utils.ts (86%) diff --git a/packages/global/core/dataset/utils.ts b/packages/global/core/dataset/utils.ts index 803dce59b..17167b12a 100644 --- a/packages/global/core/dataset/utils.ts +++ b/packages/global/core/dataset/utils.ts @@ -40,5 +40,6 @@ export function getSourceNameIcon({ export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => { if (mode === TrainingModeEnum.qa) return data.length * 20; if (mode === TrainingModeEnum.auto) return data.length * 5; + if (mode === TrainingModeEnum.image) return data.length * 2; return data.length; }; diff --git a/packages/service/common/file/gridfs/controller.ts b/packages/service/common/file/gridfs/controller.ts index c85e8474f..05708ed20 100644 --- a/packages/service/common/file/gridfs/controller.ts +++ b/packages/service/common/file/gridfs/controller.ts @@ -223,7 +223,7 @@ export const readFileContentFromMongo = async ({ rawText: string; filename: string; }> => { - const bufferId = `${fileId}-${customPdfParse}`; + const bufferId = 
`${String(fileId)}-${customPdfParse}`; // read buffer const fileBuffer = await getRawTextBuffer(bufferId); if (fileBuffer) { diff --git a/packages/service/common/file/gridfs/utils.ts b/packages/service/common/file/gridfs/utils.ts index 9e376f28f..c743b7136 100644 --- a/packages/service/common/file/gridfs/utils.ts +++ b/packages/service/common/file/gridfs/utils.ts @@ -1,5 +1,57 @@ import { detectFileEncoding } from '@fastgpt/global/common/file/tools'; import { PassThrough } from 'stream'; +import { getGridBucket } from './controller'; +import { type BucketNameEnum } from '@fastgpt/global/common/file/constants'; +import { retryFn } from '@fastgpt/global/common/system/utils'; + +export const createFileFromText = async ({ + bucket, + filename, + text, + metadata +}: { + bucket: `${BucketNameEnum}`; + filename: string; + text: string; + metadata: Record; +}) => { + const gridBucket = getGridBucket(bucket); + + const buffer = Buffer.from(text); + + const fileSize = buffer.length; + // 单块大小:尽可能大,但不超过 14MB,不小于128KB + const chunkSizeBytes = (() => { + // 计算理想块大小:文件大小 ÷ 目标块数(10)。 并且每个块需要小于 14MB + const idealChunkSize = Math.min(Math.ceil(fileSize / 10), 14 * 1024 * 1024); + + // 确保块大小至少为128KB + const minChunkSize = 128 * 1024; // 128KB + + // 取理想块大小和最小块大小中的较大值 + let chunkSize = Math.max(idealChunkSize, minChunkSize); + + // 将块大小向上取整到最接近的64KB的倍数,使其更整齐 + chunkSize = Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024); + + return chunkSize; + })(); + + const uploadStream = gridBucket.openUploadStream(filename, { + metadata, + chunkSizeBytes + }); + + return retryFn(async () => { + return new Promise<{ fileId: string }>((resolve, reject) => { + uploadStream.end(buffer); + uploadStream.on('finish', () => { + resolve({ fileId: String(uploadStream.id) }); + }); + uploadStream.on('error', reject); + }); + }); +}; export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => { return new Promise((resolve, reject) => { diff --git 
a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileCustomText.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileCustomText.tsx index a5b3f05ed..7d3cd3ee0 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileCustomText.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileCustomText.tsx @@ -49,7 +49,7 @@ const CustomTextInput = () => { createStatus: 'waiting', rawText: data.value, sourceName: data.name, - icon: 'file/fill/manual' + icon: 'file/fill/txt' } ]); goToNext(); diff --git a/projects/app/src/pages/api/core/dataset/collection/create/text.ts b/projects/app/src/pages/api/core/dataset/collection/create/text.ts index d2d41ad2f..3a75aa5b5 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/text.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/text.ts @@ -6,6 +6,7 @@ import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant import { NextAPI } from '@/service/middleware/entry'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { type CreateCollectionResponse } from '@/global/core/dataset/api'; +import { createFileFromText } from '@fastgpt/service/common/file/gridfs/utils'; async function handler(req: NextApiRequest): CreateCollectionResponse { const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams; @@ -18,6 +19,18 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { per: WritePermissionVal }); + // 1. 
Create file from text + const filename = `${name}.txt`; + const { fileId } = await createFileFromText({ + bucket: 'dataset', + filename, + text, + metadata: { + teamId, + uid: tmbId + } + }); + const { collectionId, insertResults } = await createCollectionAndInsertData({ dataset, rawText: text, @@ -25,9 +38,9 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { ...body, teamId, tmbId, - type: DatasetCollectionTypeEnum.virtual, - - name + type: DatasetCollectionTypeEnum.file, + fileId, + name: filename } }); diff --git a/projects/app/src/service/events/generateQA.ts b/projects/app/src/service/core/dataset/queues/generateQA.ts similarity index 98% rename from projects/app/src/service/events/generateQA.ts rename to projects/app/src/service/core/dataset/queues/generateQA.ts index 4584fc8ca..e4fb1d355 100644 --- a/projects/app/src/service/events/generateQA.ts +++ b/projects/app/src/service/core/dataset/queues/generateQA.ts @@ -2,7 +2,7 @@ import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/sch import { pushQAUsage } from '@/service/support/wallet/usage/push'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { createChatCompletion } from '@fastgpt/service/core/ai/config'; -import type { ChatCompletionMessageParam, StreamChatType } from '@fastgpt/global/core/ai/type.d'; +import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d'; import { addLog } from '@fastgpt/service/common/system/log'; import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; import { replaceVariable } from '@fastgpt/global/common/string/tools'; diff --git a/projects/app/src/service/events/generateVector.ts b/projects/app/src/service/core/dataset/queues/generateVector.ts similarity index 100% rename from projects/app/src/service/events/generateVector.ts rename to projects/app/src/service/core/dataset/queues/generateVector.ts diff --git a/projects/app/src/service/events/utils.ts 
b/projects/app/src/service/core/dataset/queues/utils.ts similarity index 86% rename from projects/app/src/service/events/utils.ts rename to projects/app/src/service/core/dataset/queues/utils.ts index 036b71235..1926f7cfc 100644 --- a/projects/app/src/service/events/utils.ts +++ b/projects/app/src/service/core/dataset/queues/utils.ts @@ -1,6 +1,6 @@ import { TeamErrEnum } from '@fastgpt/global/common/error/code/team'; import { checkTeamAIPoints } from '@fastgpt/service/support/permission/teamLimit'; -import { sendOneInform } from '../support/user/inform/api'; +import { sendOneInform } from '../../../support/user/inform/api'; import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller'; import { InformLevelEnum } from '@fastgpt/global/support/user/inform/constants'; @@ -18,7 +18,7 @@ export const checkTeamAiPointsAndLock = async (teamId: string) => { templateParam: {}, teamId }); - console.log('余额不足,暂停【向量】生成任务'); + console.log('余额不足,暂停训练生成任务'); await lockTrainingDataByTeamId(teamId); } catch (error) {} } diff --git a/projects/app/src/service/core/dataset/training/utils.ts b/projects/app/src/service/core/dataset/training/utils.ts index 8f0159951..be3a57449 100644 --- a/projects/app/src/service/core/dataset/training/utils.ts +++ b/projects/app/src/service/core/dataset/training/utils.ts @@ -1,5 +1,5 @@ -import { generateQA } from '@/service/events/generateQA'; -import { generateVector } from '@/service/events/generateVector'; +import { generateQA } from '@/service/core/dataset/queues/generateQA'; +import { generateVector } from '@/service/core/dataset/queues/generateVector'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { type DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type'; import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';