diff --git a/.vscode/nextapi.code-snippets b/.vscode/nextapi.code-snippets index e597e0bac..c9c084ed0 100644 --- a/.vscode/nextapi.code-snippets +++ b/.vscode/nextapi.code-snippets @@ -20,9 +20,9 @@ "export type ${TM_FILENAME_BASE}Response = {};", "", "async function handler(", - " req: ApiRequestProps,", + " req: ApiRequestProps<${TM_FILENAME_BASE}Body, ${TM_FILENAME_BASE}Query>,", " res: ApiResponseType", - "): Promise {", + "): Promise<${TM_FILENAME_BASE}Response> {", " $1", " return {}", "}", diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts index 1a6d02360..7ef609a74 100644 --- a/packages/global/common/string/textSplitter.ts +++ b/packages/global/common/string/textSplitter.ts @@ -9,6 +9,9 @@ type SplitProps = { overlapRatio?: number; customReg?: string[]; }; +export type TextSplitProps = Omit & { + chunkLen?: number; +}; type SplitResponse = { chunks: string[]; @@ -49,6 +52,7 @@ const strIsMdTable = (str: string) => { return false; } } + return true; }; const markdownTableSplit = (props: SplitProps): SplitResponse => { @@ -77,6 +81,10 @@ ${mdSplitString} chunk += `${splitText2Lines[i]}\n`; } + if (chunk) { + chunks.push(chunk); + } + return { chunks, chars: chunks.reduce((sum, chunk) => sum + chunk.length, 0) diff --git a/packages/global/common/system/types/index.d.ts b/packages/global/common/system/types/index.d.ts index b1802188a..5a861de77 100644 --- a/packages/global/common/system/types/index.d.ts +++ b/packages/global/common/system/types/index.d.ts @@ -66,6 +66,8 @@ export type SystemEnvType = { vectorMaxProcess: number; qaMaxProcess: number; pgHNSWEfSearch: number; + tokenWorkers: number; // token count max worker + oneapiUrl?: string; chatApiKey?: string; }; diff --git a/packages/global/core/dataset/constants.ts b/packages/global/core/dataset/constants.ts index 570fc1417..76fe57e45 100644 --- a/packages/global/core/dataset/constants.ts +++ b/packages/global/core/dataset/constants.ts @@ -170,3 +170,10 @@ export const SearchScoreTypeMap = { export const CustomCollectionIcon = 'common/linkBlue'; export const LinkCollectionIcon = 'common/linkBlue'; + +/* source prefix */ +export enum DatasetSourceReadTypeEnum { + fileLocal = 'fileLocal', + link = 'link', + externalFile = 'externalFile' +} diff --git a/packages/global/core/dataset/read.ts b/packages/global/core/dataset/read.ts new file mode 100644 index 000000000..36ce6c798 --- /dev/null +++ b/packages/global/core/dataset/read.ts @@ -0,0 +1,16 @@ +import { DatasetSourceReadTypeEnum, ImportDataSourceEnum } from './constants'; + +export const rawTextBackupPrefix = 'index,content'; + +export const importType2ReadType = (type: ImportDataSourceEnum) => { + if (type === ImportDataSourceEnum.csvTable || type === ImportDataSourceEnum.fileLocal) { + return DatasetSourceReadTypeEnum.fileLocal; + } + if (type === ImportDataSourceEnum.fileLink) { + return DatasetSourceReadTypeEnum.link; + } + if (type === ImportDataSourceEnum.externalFile) { + return DatasetSourceReadTypeEnum.externalFile; + } + return DatasetSourceReadTypeEnum.link; +}; diff --git a/packages/service/common/file/gridfs/controller.ts b/packages/service/common/file/gridfs/controller.ts index daeda115e..862e5e727 100644 --- a/packages/service/common/file/gridfs/controller.ts +++ b/packages/service/common/file/gridfs/controller.ts @@ -151,12 +151,12 @@ export const readFileContentFromMongo = async ({ teamId, bucketName, fileId, - csvFormat = false + isQAImport = false }: { teamId: string; bucketName: `${BucketNameEnum}`; fileId: string; - csvFormat?: boolean; + isQAImport?: boolean; }): Promise<{ rawText: string; filename: string; @@ -198,7 +198,7 @@ export const readFileContentFromMongo = async ({ const { rawText } = await readFileRawContent({ extension, - csvFormat, + isQAImport, teamId, buffer: fileBuffers, encoding, diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index c35b671fc..9c6bf7fd2 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -5,6 +5,7 @@ import { addHours } from 'date-fns'; import { WorkerNameEnum, runWorker } from '../../../worker/utils'; import { ReadFileResponse } from '../../../worker/file/type'; +import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read'; export const initMarkdownText = ({ teamId, @@ -29,36 +30,44 @@ export const initMarkdownText = ({ export const readFileRawContent = async ({ extension, - csvFormat, + isQAImport, teamId, buffer, encoding, metadata }: { - csvFormat?: boolean; + isQAImport?: boolean; extension: string; teamId: string; buffer: Buffer; encoding: string; metadata?: Record; }) => { - const result = await runWorker(WorkerNameEnum.readFile, { + let { rawText, formatText } = await runWorker(WorkerNameEnum.readFile, { extension, - csvFormat, encoding, buffer }); // markdown data format if (['md', 'html', 'docx'].includes(extension)) { - result.rawText = await initMarkdownText({ + rawText = await initMarkdownText({ teamId: teamId, - md: result.rawText, + md: rawText, metadata: metadata }); } - return result; + if (['csv', 'xlsx'].includes(extension)) { + // qa data + if (isQAImport) { + rawText = rawText || ''; + } else { + rawText = formatText || ''; + } + } + + return { rawText }; }; export const htmlToMarkdown = async (html?: string | null) => { diff --git a/packages/service/common/string/cheerio.ts b/packages/service/common/string/cheerio.ts index d7056fdb4..05ee9c63a 100644 --- a/packages/service/common/string/cheerio.ts +++ b/packages/service/common/string/cheerio.ts @@ -77,9 +77,8 @@ export const urlsFetch = async ({ $, selector }); - console.log('html====', html); + const md = await htmlToMarkdown(html); - console.log('html====', md); return { url, diff --git a/packages/service/common/string/tiktoken/index.ts b/packages/service/common/string/tiktoken/index.ts index cf3e81307..07137b0fd 100644 --- a/packages/service/common/string/tiktoken/index.ts +++ b/packages/service/common/string/tiktoken/index.ts @@ -12,27 +12,34 @@ import { getNanoid } from '@fastgpt/global/common/string/tools'; import { addLog } from '../../system/log'; export const getTiktokenWorker = () => { - if (global.tiktokenWorker) { - return global.tiktokenWorker; + const maxWorkers = global.systemEnv?.tokenWorkers || 20; + + if (!global.tiktokenWorkers) { + global.tiktokenWorkers = []; + } + + if (global.tiktokenWorkers.length >= maxWorkers) { + return global.tiktokenWorkers[Math.floor(Math.random() * global.tiktokenWorkers.length)]; } const worker = getWorker(WorkerNameEnum.countGptMessagesTokens); + const i = global.tiktokenWorkers.push({ + index: global.tiktokenWorkers.length, + worker, + callbackMap: {} + }); + worker.on('message', ({ id, data }: { id: string; data: number }) => { - const callback = global.tiktokenWorker?.callbackMap?.[id]; + const callback = global.tiktokenWorkers[i - 1]?.callbackMap?.[id]; if (callback) { callback?.(data); - delete global.tiktokenWorker.callbackMap[id]; + delete global.tiktokenWorkers[i - 1].callbackMap[id]; } }); - global.tiktokenWorker = { - worker, - callbackMap: {} - }; - - return global.tiktokenWorker; + return global.tiktokenWorkers[i - 1]; }; export const countGptMessagesTokens = ( @@ -44,20 +51,29 @@ export const countGptMessagesTokens = ( const start = Date.now(); const { worker, callbackMap } = getTiktokenWorker(); + const id = getNanoid(); const timer = setTimeout(() => { - resolve(0); + console.log('Count token Time out'); + resolve( + messages.reduce((sum, item) => { + if (item.content) { + return sum + item.content.length * 0.5; + } + return sum; + }, 0) + ); delete callbackMap[id]; - }, 300); + }, 60000); callbackMap[id] = (data) => { + // 检测是否有内存泄漏 + addLog.info(`Count token time: ${Date.now() - start}, token: ${data}`); + // console.log(process.memoryUsage()); + resolve(data); clearTimeout(timer); - - // 检测是否有内存泄漏 - // addLog.info(`Count token time: ${Date.now() - start}, token: ${data}`); - // console.log(process.memoryUsage()); }; worker.postMessage({ diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts new file mode 100644 index 000000000..5b9902b33 --- /dev/null +++ b/packages/service/core/dataset/read.ts @@ -0,0 +1,99 @@ +import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; +import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; +import { readFileContentFromMongo } from '../../common/file/gridfs/controller'; +import { urlsFetch } from '../../common/string/cheerio'; +import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read'; +import { parseCsvTable2Chunks } from './training/utils'; +import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; +import axios from 'axios'; +import { readFileRawContent } from '../../common/file/read/utils'; + +export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; url: string }) => { + const response = await axios({ + method: 'get', + url: url, + responseType: 'arraybuffer' + }); + const extension = url.split('.')?.pop()?.toLowerCase() || ''; + + const buffer = Buffer.from(response.data, 'binary'); + + const { rawText } = await readFileRawContent({ + extension, + teamId, + buffer, + encoding: 'utf-8' + }); + + return rawText; +}; + +/* + fileId - local file, read from mongo + link - request + externalFile = request read +*/ +export const readDatasetSourceRawText = async ({ + teamId, + type, + sourceId, + isQAImport, + selector +}: { + teamId: string; + type: DatasetSourceReadTypeEnum; + sourceId: string; + isQAImport?: boolean; + selector?: string; +}): Promise => { + if (type === DatasetSourceReadTypeEnum.fileLocal) { + const { rawText } = await readFileContentFromMongo({ + teamId, + bucketName: BucketNameEnum.dataset, + fileId: sourceId, + isQAImport + }); + return rawText; + } else if (type === DatasetSourceReadTypeEnum.link) { + const result = await urlsFetch({ + urlList: [sourceId], + selector + }); + + return result[0]?.content || ''; + } else if (type === DatasetSourceReadTypeEnum.externalFile) { + const rawText = await readFileRawTextByUrl({ + teamId, + url: sourceId + }); + return rawText; + } + + return ''; +}; + +export const rawText2Chunks = ({ + rawText, + isQAImport, + chunkLen = 512, + ...splitProps +}: { + rawText: string; + isQAImport?: boolean; +} & TextSplitProps) => { + if (isQAImport) { + const { chunks } = parseCsvTable2Chunks(rawText); + return chunks; + } + + const { chunks } = splitText2Chunks({ + text: rawText, + chunkLen, + ...splitProps + }); + + return chunks.map((item) => ({ + q: item, + a: '' + })); +}; diff --git a/packages/service/core/workflow/dispatch/tools/http468.ts b/packages/service/core/workflow/dispatch/tools/http468.ts index 289cc053c..2075498dc 100644 --- a/packages/service/core/workflow/dispatch/tools/http468.ts +++ b/packages/service/core/workflow/dispatch/tools/http468.ts @@ -71,7 +71,7 @@ export const dispatchHttp468Request = async (props: HttpRequestProps): Promise { return JSON.stringify(value); } if (type === 'number') return Number(value); - if (type === 'boolean') return value === 'true' ? true : false; + if (type === 'boolean') { + if (typeof value === 'string') return value === 'true'; + return Boolean(value); + } try { if (type === WorkflowIOValueTypeEnum.datasetQuote && !Array.isArray(value)) { return JSON.parse(value); diff --git a/packages/service/package.json b/packages/service/package.json index 661eb8451..8f1f167ac 100644 --- a/packages/service/package.json +++ b/packages/service/package.json @@ -13,10 +13,10 @@ "decompress": "^4.2.1", "domino-ext": "^2.1.4", "encoding": "^0.1.13", + "fastgpt-js-tiktoken": "^1.0.12", "file-type": "^19.0.0", "iconv-lite": "^0.6.3", "joplin-turndown-plugin-gfm": "^1.0.12", - "js-tiktoken": "^1.0.7", "json5": "^2.2.3", "jsonwebtoken": "^9.0.2", "mammoth": "^1.6.0", diff --git a/packages/service/type.d.ts b/packages/service/type.d.ts index 281cae923..b3daebf9d 100644 --- a/packages/service/type.d.ts +++ b/packages/service/type.d.ts @@ -20,8 +20,9 @@ declare global { var whisperModel: WhisperModelType; var reRankModels: ReRankModelItemType[]; - var tiktokenWorker: { + var tiktokenWorkers: { + index: number; worker: Worker; callbackMap: Record void>; - }; + }[]; } diff --git a/packages/service/worker/file/extension/pdf.ts b/packages/service/worker/file/extension/pdf.ts index b6e43baf2..f68dfadeb 100644 --- a/packages/service/worker/file/extension/pdf.ts +++ b/packages/service/worker/file/extension/pdf.ts @@ -15,40 +15,45 @@ type TokenType = { export const readPdfFile = async ({ buffer }: ReadRawTextByBuffer): Promise => { const readPDFPage = async (doc: any, pageNo: number) => { - const page = await doc.getPage(pageNo); - const tokenizedText = await page.getTextContent(); + try { + const page = await doc.getPage(pageNo); + const tokenizedText = await page.getTextContent(); - const viewport = page.getViewport({ scale: 1 }); - const pageHeight = viewport.height; - const headerThreshold = pageHeight * 0.95; - const footerThreshold = pageHeight * 0.05; + const viewport = page.getViewport({ scale: 1 }); + const pageHeight = viewport.height; + const headerThreshold = pageHeight * 0.95; + const footerThreshold = pageHeight * 0.05; - const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => { - return ( - !token.transform || - (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold) - ); - }); + const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => { + return ( + !token.transform || + (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold) + ); + }); - // concat empty string 'hasEOL' - for (let i = 0; i < pageTexts.length; i++) { - const item = pageTexts[i]; - if (item.str === '' && pageTexts[i - 1]) { - pageTexts[i - 1].hasEOL = item.hasEOL; - pageTexts.splice(i, 1); - i--; + // concat empty string 'hasEOL' + for (let i = 0; i < pageTexts.length; i++) { + const item = pageTexts[i]; + if (item.str === '' && pageTexts[i - 1]) { + pageTexts[i - 1].hasEOL = item.hasEOL; + pageTexts.splice(i, 1); + i--; + } } + + page.cleanup(); + + return pageTexts + .map((token) => { + const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str); + + return paragraphEnd ? `${token.str}\n` : token.str; + }) + .join(''); + } catch (error) { + console.log('pdf read error', error); + return ''; } - - page.cleanup(); - - return pageTexts - .map((token) => { - const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str); - - return paragraphEnd ? `${token.str}\n` : token.str; - }) - .join(''); }; const loadingTask = pdfjs.getDocument(buffer.buffer); @@ -58,6 +63,7 @@ export const readPdfFile = async ({ buffer }: ReadRawTextByBuffer): Promise) => { case 'pptx': return readPptxRawText(params); case 'xlsx': - const xlsxResult = await readXlsxRawText(params); - if (params.csvFormat) { - return { - rawText: xlsxResult.formatText || '' - }; - } - return { - rawText: xlsxResult.rawText - }; + return readXlsxRawText(params); case 'csv': - const csvResult = await readCsvRawText(params); - if (params.csvFormat) { - return { - rawText: csvResult.formatText || '' - }; - } - return { - rawText: csvResult.rawText - }; + return readCsvRawText(params); default: return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx'); } diff --git a/packages/service/worker/file/type.d.ts b/packages/service/worker/file/type.d.ts index 0d136861d..41d5d125e 100644 --- a/packages/service/worker/file/type.d.ts +++ b/packages/service/worker/file/type.d.ts @@ -1,7 +1,6 @@ import { ReadFileByBufferParams } from '../../common/file/read/type'; export type ReadRawTextProps = { - csvFormat?: boolean; extension: string; buffer: T; encoding: string; diff --git a/packages/service/worker/tiktoken/countGptMessagesTokens.ts b/packages/service/worker/tiktoken/countGptMessagesTokens.ts index 80b13c318..1ef2b30e7 100644 --- a/packages/service/worker/tiktoken/countGptMessagesTokens.ts +++ b/packages/service/worker/tiktoken/countGptMessagesTokens.ts @@ -1,6 +1,6 @@ /* Only the token of gpt-3.5-turbo is used */ -import { Tiktoken } from 'js-tiktoken/lite'; -import encodingJson from './cl100k_base.json'; +import { Tiktoken } from 'fastgpt-js-tiktoken/lite'; +import cl100k_base from './cl100k_base.json'; import { ChatCompletionMessageParam, ChatCompletionContentPart, @@ -10,7 +10,7 @@ import { import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants'; import { parentPort } from 'worker_threads'; -const enc = new Tiktoken(encodingJson); +const enc = new Tiktoken(cl100k_base); /* count messages tokens */ parentPort?.on( diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a5fc2daf0..4e78857b2 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -126,6 +126,9 @@ importers: encoding: specifier: ^0.1.13 version: 0.1.13 + fastgpt-js-tiktoken: + specifier: ^1.0.12 + version: registry.npmjs.org/fastgpt-js-tiktoken@1.0.12 file-type: specifier: ^19.0.0 version: 19.0.0 @@ -135,9 +138,6 @@ importers: joplin-turndown-plugin-gfm: specifier: ^1.0.12 version: 1.0.12 - js-tiktoken: - specifier: ^1.0.7 - version: 1.0.7 json5: specifier: ^2.2.3 version: 2.2.3 @@ -155,7 +155,7 @@ importers: version: 1.4.5-lts.1 next: specifier: 13.5.2 - version: 13.5.2(@babel/core@7.24.4)(react-dom@18.2.0)(react@18.2.0)(sass@1.58.3) + version: 13.5.2(react-dom@18.2.0)(react@18.2.0) nextjs-cors: specifier: ^2.1.2 version: 2.1.2(next@13.5.2) @@ -8722,12 +8722,6 @@ packages: resolution: {integrity: sha512-dwXFwByc/ajSV6m5bcKAPwe4yDDF6D614pxmIi5odytzxRlwqF6nwoiCek80Ixc7Cvma5awClxrzFtxCQvcM8w==} dev: true - /js-tiktoken@1.0.7: - resolution: {integrity: sha512-biba8u/clw7iesNEWLOLwrNGoBP2lA+hTaBLs/D45pJdUPFXyxD6nhcDVtADChghv4GgyAiMKYMiRx7x6h7Biw==} - dependencies: - base64-js: 1.5.1 - dev: false - /js-tokens@4.0.0: resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==} @@ -9933,13 +9927,53 @@ packages: - '@babel/core' - babel-plugin-macros + /next@13.5.2(react-dom@18.2.0)(react@18.2.0): + resolution: {integrity: sha512-vog4UhUaMYAzeqfiAAmgB/QWLW7p01/sg+2vn6bqc/CxHFYizMzLv6gjxKzl31EVFkfl/F+GbxlKizlkTE9RdA==} + engines: {node: '>=16.14.0'} + hasBin: true + peerDependencies: + '@opentelemetry/api': ^1.1.0 + react: ^18.2.0 + react-dom: ^18.2.0 + sass: ^1.3.0 + peerDependenciesMeta: + '@opentelemetry/api': + optional: true + sass: + optional: true + dependencies: + '@next/env': 13.5.2 + '@swc/helpers': 0.5.2 + busboy: 1.6.0 + caniuse-lite: 1.0.30001603 + postcss: 8.4.14 + react: 18.2.0 + react-dom: 18.2.0(react@18.2.0) + styled-jsx: 5.1.1(react@18.2.0) + watchpack: 2.4.0 + zod: 3.21.4 + optionalDependencies: + '@next/swc-darwin-arm64': 13.5.2 + '@next/swc-darwin-x64': 13.5.2 + '@next/swc-linux-arm64-gnu': 13.5.2 + '@next/swc-linux-arm64-musl': 13.5.2 + '@next/swc-linux-x64-gnu': 13.5.2 + '@next/swc-linux-x64-musl': 13.5.2 + '@next/swc-win32-arm64-msvc': 13.5.2 + '@next/swc-win32-ia32-msvc': 13.5.2 + '@next/swc-win32-x64-msvc': 13.5.2 + transitivePeerDependencies: + - '@babel/core' + - babel-plugin-macros + dev: false + /nextjs-cors@2.1.2(next@13.5.2): resolution: {integrity: sha512-2yOVivaaf2ILe4f/qY32hnj3oC77VCOsUQJQfhVMGsXE/YMEWUY2zy78sH9FKUCM7eG42/l3pDofIzMD781XGA==} peerDependencies: next: ^8.1.1-canary.54 || ^9.0.0 || ^10.0.0-0 || ^11.0.0 || ^12.0.0 || ^13.0.0 dependencies: cors: 2.8.5 - next: 13.5.2(@babel/core@7.24.4)(react-dom@18.2.0)(react@18.2.0)(sass@1.58.3) + next: 13.5.2(react-dom@18.2.0)(react@18.2.0) dev: false /nextjs-node-loader@1.1.5(webpack@5.91.0): @@ -11725,6 +11759,23 @@ packages: client-only: 0.0.1 react: 18.2.0 + /styled-jsx@5.1.1(react@18.2.0): + resolution: {integrity: sha512-pW7uC1l4mBZ8ugbiZrcIsiIvVx1UmTfw7UkC3Um2tmfUq9Bhk8IiyEIPl6F8agHgjzku6j0xQEZbfA5uSgSaCw==} + engines: {node: '>= 12.0.0'} + peerDependencies: + '@babel/core': '*' + babel-plugin-macros: '*' + react: '>= 16.8.0 || 17.x.x || ^18.0.0-0' + peerDependenciesMeta: + '@babel/core': + optional: true + babel-plugin-macros: + optional: true + dependencies: + client-only: 0.0.1 + react: 18.2.0 + dev: false + /stylis@4.2.0: resolution: {integrity: sha512-Orov6g6BB1sDfYgzWfTHDOxamtX1bE/zo104Dh9e6fqJ3PooipYyfJ0pUmrZO2wAvO8YbEyeFrkV91XTsGMSrw==} dev: false @@ -12799,3 +12850,11 @@ packages: engines: {node: '>=0.8'} hasBin: true dev: false + + registry.npmjs.org/fastgpt-js-tiktoken@1.0.12: + resolution: {integrity: sha512-93UQM9h267PFQqnaJjcc+tqbKRZuipRbi+ASxVcE1FBzXOVb4GKfOMlsxXKCsSDdP+Luv8Fgul7F3HXKITXjYQ==, registry: https://registry.npmmirror.com/, tarball: https://registry.npmjs.org/fastgpt-js-tiktoken/-/fastgpt-js-tiktoken-1.0.12.tgz} + name: fastgpt-js-tiktoken + version: 1.0.12 + dependencies: + base64-js: 1.5.1 + dev: false diff --git a/projects/app/data/config.json b/projects/app/data/config.json index 30ceebdb5..1f8030643 100644 --- a/projects/app/data/config.json +++ b/projects/app/data/config.json @@ -6,7 +6,8 @@ "openapiPrefix": "fastgpt", "vectorMaxProcess": 15, "qaMaxProcess": 15, - "pgHNSWEfSearch": 100 + "pgHNSWEfSearch": 100, + "tokenWorkers": 20 }, "llmModels": [ { diff --git a/projects/app/src/global/core/dataset/api.d.ts b/projects/app/src/global/core/dataset/api.d.ts index e5fb7cf9f..6263f783f 100644 --- a/projects/app/src/global/core/dataset/api.d.ts +++ b/projects/app/src/global/core/dataset/api.d.ts @@ -1,6 +1,7 @@ import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api'; import { DatasetSearchModeEnum, + DatasetSourceReadTypeEnum, DatasetTypeEnum, ImportDataSourceEnum, TrainingModeEnum @@ -75,22 +76,3 @@ export type SearchTestResponse = { }; /* =========== training =========== */ -export type PostPreviewFilesChunksProps = { - type: ImportDataSourceEnum; - sourceId: string; - chunkSize: number; - overlapRatio: number; - customSplitChar?: string; -}; - -export type PostPreviewFilesChunksResponse = { - fileId: string; - rawTextLength: number; - chunks: string[]; -}[]; -export type PostPreviewTableChunksResponse = { - fileId: string; - totalChunks: number; - chunks: { q: string; a: string; chunkIndex: number }[]; - errorText?: string; -}[]; diff --git a/projects/app/src/middleware.ts b/projects/app/src/middleware.ts new file mode 100644 index 000000000..08aaf843d --- /dev/null +++ b/projects/app/src/middleware.ts @@ -0,0 +1,18 @@ +import { addLog } from '@fastgpt/service/common/system/log'; +import { NextResponse } from 'next/server'; +import type { NextRequest } from 'next/server'; + +export function middleware(request: NextRequest) { + const response = NextResponse.next(); + + addLog.info(`Request URL: ${request.url}`, { + body: request.body + }); + + return response; +} + +// See "Matching Paths" below to learn more +export const config = { + matcher: '/api/:path*' +}; diff --git a/projects/app/src/pages/api/common/file/previewContent.ts b/projects/app/src/pages/api/common/file/previewContent.ts index 3a4869377..42267447d 100644 --- a/projects/app/src/pages/api/common/file/previewContent.ts +++ b/projects/app/src/pages/api/common/file/previewContent.ts @@ -1,41 +1,50 @@ /* Read db file content and response 3000 words */ -import type { NextApiRequest, NextApiResponse } from 'next'; +import type { NextApiResponse } from 'next'; import { jsonRes } from '@fastgpt/service/common/response'; -import { connectToDatabase } from '@/service/mongo'; -import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/controller'; import { authFile } from '@fastgpt/service/support/permission/auth/file'; -import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; +import { NextAPI } from '@/service/middle/entry'; +import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; +import { readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read'; +import { ApiRequestProps } from '@fastgpt/service/type/next'; +import { authCert } from '@fastgpt/service/support/permission/auth/common'; -export default async function handler(req: NextApiRequest, res: NextApiResponse) { - try { - await connectToDatabase(); - const { fileId, csvFormat } = req.body as { fileId: string; csvFormat?: boolean }; +export type PreviewContextProps = { + type: DatasetSourceReadTypeEnum; + sourceId: string; + isQAImport?: boolean; + selector?: string; +}; - if (!fileId) { - throw new Error('fileId is empty'); - } +async function handler(req: ApiRequestProps, res: NextApiResponse) { + const { type, sourceId, isQAImport, selector } = req.body; - const { teamId } = await authFile({ req, authToken: true, fileId }); - - const { rawText } = await readFileContentFromMongo({ - teamId, - bucketName: BucketNameEnum.dataset, - fileId, - csvFormat - }); - - jsonRes(res, { - data: { - previewContent: rawText.slice(0, 3000), - totalLength: rawText.length - } - }); - } catch (error) { - jsonRes(res, { - code: 500, - error - }); + if (!sourceId) { + throw new Error('fileId is empty'); } + + const { teamId } = await (async () => { + if (type === DatasetSourceReadTypeEnum.fileLocal) { + return authFile({ req, authToken: true, authApiKey: true, fileId: sourceId }); + } + return authCert({ req, authApiKey: true, authToken: true }); + })(); + + const rawText = await readDatasetSourceRawText({ + teamId, + type, + sourceId: sourceId, + isQAImport, + selector + }); + + jsonRes(res, { + data: { + previewContent: rawText.slice(0, 3000), + totalLength: rawText.length + } + }); } + +export default NextAPI(handler); diff --git a/projects/app/src/pages/api/core/ai/token.ts b/projects/app/src/pages/api/core/ai/token.ts new file mode 100644 index 000000000..34effcb4c --- /dev/null +++ b/projects/app/src/pages/api/core/ai/token.ts @@ -0,0 +1,41 @@ +import type { ApiRequestProps, ApiResponseType } from '@fastgpt/service/type/next'; +import { NextAPI } from '@/service/middle/entry'; +import { authCert } from '@fastgpt/service/support/permission/auth/common'; +import { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type'; +import { countGptMessagesTokens } from '@fastgpt/service/common/string/tiktoken'; + +export type tokenQuery = {}; + +export type tokenBody = { + messages: ChatCompletionMessageParam[]; +}; + +export type tokenResponse = {}; + +async function handler( + req: ApiRequestProps, + res: ApiResponseType +): Promise { + await authCert({ req, authRoot: true }); + const start = Date.now(); + + const tokens = await countGptMessagesTokens(req.body.messages); + + return { + tokens, + time: Date.now() - start, + + memory: process.memoryUsage() + }; +} + +export default NextAPI(handler); + +export const config = { + api: { + bodyParser: { + sizeLimit: '20mb' + }, + responseLimit: '20mb' + } +}; diff --git a/projects/app/src/pages/api/core/dataset/collection/create/csvTable.ts b/projects/app/src/pages/api/core/dataset/collection/create/csvTable.ts index e37890ca0..3f12b1492 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/csvTable.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/csvTable.ts @@ -19,6 +19,7 @@ import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants' import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model'; import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils'; import { startTrainingQueue } from '@/service/core/dataset/training/utils'; +import { rawText2Chunks } from '@fastgpt/service/core/dataset/read'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { const { datasetId, parentId, fileId } = req.body as FileIdCreateDatasetCollectionParams; @@ -39,10 +40,15 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< const { rawText, filename } = await readFileContentFromMongo({ teamId, bucketName: BucketNameEnum.dataset, - fileId + fileId, + isQAImport: true }); + console.log(rawText); // 2. split chunks - const { chunks = [] } = parseCsvTable2Chunks(rawText); + const chunks = rawText2Chunks({ + rawText, + isQAImport: true + }); // 3. auth limit await checkDatasetLimit({ diff --git a/projects/app/src/pages/api/core/dataset/collection/create/file.ts b/projects/app/src/pages/api/core/dataset/collection/create/file.ts index 2bc526f27..268510560 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/file.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/file.ts @@ -22,6 +22,7 @@ import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model'; import { hashStr } from '@fastgpt/global/common/string/tools'; import { startTrainingQueue } from '@/service/core/dataset/training/utils'; import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schema'; +import { rawText2Chunks } from '@fastgpt/service/core/dataset/read'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { const { @@ -51,8 +52,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< fileId }); // 2. split chunks - const { chunks } = splitText2Chunks({ - text: rawText, + const chunks = rawText2Chunks({ + rawText, chunkLen: chunkSize, overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0, customReg: chunkSplitter ? [chunkSplitter] : [] @@ -110,8 +111,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< trainingMode: trainingType, prompt: qaPrompt, billId, - data: chunks.map((text, index) => ({ - q: text, + data: chunks.map((item, index) => ({ + ...item, chunkIndex: index })), session diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts index c60bf2542..1f6cf5181 100644 --- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts +++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts @@ -1,79 +1,60 @@ -import type { NextApiRequest, NextApiResponse } from 'next'; -import { jsonRes } from '@fastgpt/service/common/response'; -import { connectToDatabase } from '@/service/mongo'; -import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; +import type { NextApiResponse } from 'next'; import { authFile } from '@fastgpt/service/support/permission/auth/file'; -import { PostPreviewFilesChunksProps } from '@/global/core/dataset/api'; -import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/controller'; -import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; -import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants'; -import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils'; +import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; +import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read'; +import { authCert } from '@fastgpt/service/support/permission/auth/common'; +import { NextAPI } from '@/service/middle/entry'; +import { ApiRequestProps } from '@fastgpt/service/type/next'; -export default async function handler(req: NextApiRequest, res: NextApiResponse) { - try { - await connectToDatabase(); +export type PostPreviewFilesChunksProps = { + type: DatasetSourceReadTypeEnum; + sourceId: string; + chunkSize: number; + overlapRatio: number; + customSplitChar?: string; + selector?: string; + isQAImport?: boolean; +}; +export type PreviewChunksResponse = { + q: string; + a: string; +}[]; - const { type, sourceId, chunkSize, customSplitChar, overlapRatio } = - req.body as PostPreviewFilesChunksProps; +async function handler( + req: ApiRequestProps, + res: NextApiResponse +): Promise { + const { type, sourceId, chunkSize, customSplitChar, overlapRatio, selector, isQAImport } = + req.body; - if (!sourceId) { - throw new Error('fileIdList is empty'); - } - if (chunkSize > 30000) { - throw new Error('chunkSize is too large, should be less than 30000'); - } - - const { chunks } = await (async () => { - if (type === ImportDataSourceEnum.fileLocal) { - const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId }); - const fileId = String(file._id); - - const { rawText } = await readFileContentFromMongo({ - teamId, - bucketName: BucketNameEnum.dataset, - fileId, - csvFormat: true - }); - // split chunks (5 chunk) - const { chunks } = splitText2Chunks({ - text: rawText, - chunkLen: chunkSize, - overlapRatio, - customReg: customSplitChar ? [customSplitChar] : [] - }); - - return { - chunks: chunks.map((item) => ({ - q: item, - a: '' - })) - }; - } - if (type === ImportDataSourceEnum.csvTable) { - const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId }); - const fileId = String(file._id); - const { rawText } = await readFileContentFromMongo({ - teamId, - bucketName: BucketNameEnum.dataset, - fileId, - csvFormat: false - }); - const { chunks } = parseCsvTable2Chunks(rawText); - - return { - chunks: chunks || [] - }; - } - return { chunks: [] }; - })(); - - jsonRes<{ q: string; a: string }[]>(res, { - data: chunks.slice(0, 5) - }); - } catch (error) { - jsonRes(res, { - code: 500, - error - }); + if (!sourceId) { + throw new Error('sourceId is empty'); } + if (chunkSize > 30000) { + throw new Error('chunkSize is too large, should be less than 30000'); + } + + const { teamId } = await (async () => { + if (type === DatasetSourceReadTypeEnum.fileLocal) { + return authFile({ req, authToken: true, authApiKey: true, fileId: sourceId }); + } + return authCert({ req, authApiKey: true, authToken: true }); + })(); + + const rawText = await readDatasetSourceRawText({ + teamId, + type, + sourceId: sourceId, + selector, + isQAImport + }); + + return rawText2Chunks({ + rawText, + chunkLen: chunkSize, + overlapRatio, + customReg: customSplitChar ? [customSplitChar] : [], + isQAImport: isQAImport + }).slice(0, 5); } +export default NextAPI(handler); diff --git a/projects/app/src/pages/app/list/index.tsx b/projects/app/src/pages/app/list/index.tsx index b1d6ef112..27700c723 100644 --- a/projects/app/src/pages/app/list/index.tsx +++ b/projects/app/src/pages/app/list/index.tsx @@ -16,8 +16,10 @@ import { useAppStore } from '@/web/core/app/store/useAppStore'; import PermissionIconText from '@/components/support/permission/IconText'; import { useUserStore } from '@/web/support/user/useUserStore'; import { useI18n } from '@/web/context/I18n'; +import { useTranslation } from 'next-i18next'; const MyApps = () => { + const { t } = useTranslation(); const { toast } = useToast(); const { appT, commonT } = useI18n(); @@ -46,12 +48,12 @@ const MyApps = () => { loadMyApps(true); } catch (err: any) { toast({ - title: err?.message || '删除失败', + title: err?.message || t('common.Delete Failed'), status: 'error' }); } }, - [toast, loadMyApps] + [toast, loadMyApps, t] ); /* 加载模型 */ diff --git a/projects/app/src/pages/dataset/detail/components/Import/components/PreviewChunks.tsx b/projects/app/src/pages/dataset/detail/components/Import/components/PreviewChunks.tsx index 65b30ce74..74dfb438f 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/components/PreviewChunks.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/components/PreviewChunks.tsx @@ -10,6 +10,7 @@ import { useToast } from '@fastgpt/web/hooks/useToast'; import { getErrText } from '@fastgpt/global/common/error/utils'; import { useContextSelector } from 'use-context-selector'; import { DatasetImportContext } from '../Context'; +import { importType2ReadType } from '@fastgpt/global/core/dataset/read'; const PreviewChunks = ({ previewSource, @@ -27,19 +28,7 @@ const PreviewChunks = ({ const { data = [], isLoading } = useQuery( ['previewSource'], () => { - if ( - importSource === ImportDataSourceEnum.fileLocal || - importSource === ImportDataSourceEnum.csvTable || - importSource === ImportDataSourceEnum.fileLink - ) { - return getPreviewChunks({ - type: importSource, - sourceId: previewSource.dbFileId || previewSource.link || '', - chunkSize, - overlapRatio: chunkOverlapRatio, - customSplitChar: processParamsForm.getValues('customSplitChar') - }); - } else if (importSource === ImportDataSourceEnum.fileCustom) { + if (importSource === ImportDataSourceEnum.fileCustom) { const customSplitChar = processParamsForm.getValues('customSplitChar'); const { chunks } = splitText2Chunks({ text: previewSource.rawText || '', @@ -52,7 +41,27 @@ const PreviewChunks = ({ a: '' })); } - return []; + if (importSource === ImportDataSourceEnum.csvTable) { + return getPreviewChunks({ + type: importType2ReadType(importSource), + sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '', + chunkSize, + overlapRatio: chunkOverlapRatio, + customSplitChar: processParamsForm.getValues('customSplitChar'), + selector: processParamsForm.getValues('webSelector'), + isQAImport: true + }); + } + + return getPreviewChunks({ + type: importType2ReadType(importSource), + sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '', + chunkSize, + overlapRatio: chunkOverlapRatio, + customSplitChar: processParamsForm.getValues('customSplitChar'), + selector: processParamsForm.getValues('webSelector'), + isQAImport: false + }); }, { onError(err) { diff --git a/projects/app/src/pages/dataset/detail/components/Import/components/PreviewRawText.tsx b/projects/app/src/pages/dataset/detail/components/Import/components/PreviewRawText.tsx index 8c86d0b69..66fe4d597 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/components/PreviewRawText.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/components/PreviewRawText.tsx @@ -9,6 +9,7 @@ import { useToast } from '@fastgpt/web/hooks/useToast'; import { getErrText } from '@fastgpt/global/common/error/utils'; import { useContextSelector } from 'use-context-selector'; import { DatasetImportContext } from '../Context'; +import { importType2ReadType } from '@fastgpt/global/core/dataset/read'; const PreviewRawText = ({ previewSource, @@ -18,32 +19,30 @@ const PreviewRawText = ({ onClose: () => void; }) => { const { toast } = useToast(); - const { importSource } = useContextSelector(DatasetImportContext, (v) => v); + const { importSource, processParamsForm } = useContextSelector(DatasetImportContext, (v) => v); const { data, isLoading } = useQuery( - ['previewSource', previewSource?.dbFileId], + ['previewSource', previewSource.dbFileId, previewSource.link, previewSource.sourceUrl], () => { - if (importSource === ImportDataSourceEnum.fileLocal && previewSource.dbFileId) { - return getPreviewFileContent({ - fileId: previewSource.dbFileId, - csvFormat: true - }); + if (importSource === ImportDataSourceEnum.fileCustom && previewSource.rawText) { + return { + previewContent: previewSource.rawText.slice(0, 3000) + }; } if (importSource === ImportDataSourceEnum.csvTable && previewSource.dbFileId) { return getPreviewFileContent({ - fileId: previewSource.dbFileId, - csvFormat: false + type: importType2ReadType(importSource), + sourceId: previewSource.dbFileId, + isQAImport: true }); } - if (importSource === ImportDataSourceEnum.fileCustom) { - return { - previewContent: (previewSource.rawText || '').slice(0, 3000) - }; - } - return { - previewContent: '' - }; + return getPreviewFileContent({ + type: importType2ReadType(importSource), + sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '', + isQAImport: false, + selector: processParamsForm.getValues('webSelector') + }); }, { onError(err) { diff --git a/projects/app/src/pages/dataset/detail/components/Import/diffSource/ExternalFile.tsx b/projects/app/src/pages/dataset/detail/components/Import/diffSource/ExternalFile.tsx index f00f49bda..a67c49974 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/diffSource/ExternalFile.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/diffSource/ExternalFile.tsx @@ -162,7 +162,7 @@ const CustomLinkInput = () => { {commonT('Add new')}