FastGPT/src/pages/api/model/data/splitData.ts
2023-04-03 00:37:40 +08:00

79 lines
1.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { connectToDatabase, SplitData, Model } from '@/service/mongo';
import { authToken } from '@/service/utils/tools';
import { generateQA } from '@/service/events/generateQA';
import { encode } from 'gpt-token-utils';
/* 拆分数据成QA */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
const { text, modelId, prompt } = req.body as { text: string; modelId: string; prompt: string };
if (!text || !modelId || !prompt) {
throw new Error('参数错误');
}
await connectToDatabase();
const { authorization } = req.headers;
const userId = await authToken(authorization);
// 验证是否是该用户的 model
const model = await Model.findOne({
_id: modelId,
userId
});
if (!model) {
throw new Error('无权操作该模型');
}
const replaceText = text.replace(/(\\n|\n)+/g, ' ');
// 文本拆分成 chunk
let chunks = replaceText.match(/[^!?.。]+[!?.。]/g) || [];
const textList: string[] = [];
let splitText = '';
/* 取 3k ~ 4K tokens 内容 */
chunks.forEach((chunk) => {
const tokens = encode(splitText + chunk).length;
if (tokens >= 4000) {
// 超过 4000不要这块内容
textList.push(splitText);
splitText = chunk;
} else if (tokens >= 3000) {
// 超过 3000取内容
textList.push(splitText + chunk);
splitText = '';
} else {
//没超过 3000继续添加
splitText += chunk;
}
});
if (splitText) {
textList.push(splitText);
}
// 批量插入数据
await SplitData.create({
userId,
modelId,
rawText: text,
textList,
prompt
});
generateQA();
jsonRes(res);
} catch (err) {
jsonRes(res, {
code: 500,
error: err
});
}
}