Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
Archer 2025-03-03 23:08:29 +08:00 committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@ -6,6 +6,7 @@ data:
"openapiPrefix": "fastgpt",
"vectorMaxProcess": 15,
"qaMaxProcess": 15,
"vlmMaxProcess": 15,
"pgHNSWEfSearch": 100
},
"llmModels": [

View File

@ -23,6 +23,7 @@ weight: 707
"systemEnv": {
"vectorMaxProcess": 15, // 向量处理线程数量
"qaMaxProcess": 15, // 问答拆分线程数量
"vlmMaxProcess": 15, // 图片理解模型最大处理进程
"tokenWorkers": 50, // Token 计算线程保持数,会持续占用内存,不能设置太大。
"pgHNSWEfSearch": 100 // 向量搜索参数。越大搜索越精确但是速度越慢。设置为100有99%+精度。
}

View File

@ -70,6 +70,7 @@ Mongo 数据库需要注意,需要注意在连接地址中增加 `directConnec
- `vectorMaxProcess`: maximum number of vector-generation processes, set according to the concurrency of the database and the API key; a single 2c4g server is usually set to 10~15.
- `qaMaxProcess`: maximum number of QA-generation processes
- `vlmMaxProcess`: maximum number of image-understanding (VLM) model processes
- `pgHNSWEfSearch`: PostgreSQL vector index parameter; larger values give higher search accuracy but slower speed. See the official pgvector documentation for details.
### 5. Run

View File

@ -7,9 +7,18 @@ draft: false
images: []
---
## Copy files
## 1. Stop the services
```bash
docker-compose down
```
## 2. Copy the data folders
In a Docker deployment the databases mount local directories into the containers via volumes, so to migrate you only need to copy those directories.
`PG data`: pg/data
`Mongo data`: mongo/data
Simply copy the entire pg and mongo directories to the new machine.

View File

@ -297,7 +297,7 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete?
| --- | --- | --- |
| datasetId | Dataset ID | ✅ |
| parentId | Parent ID; defaults to the root directory when omitted | |
| trainingType | Training mode. chunk: split by text length; qa: QA split; auto: enhanced training | ✅ |
| trainingType | Data processing mode. chunk: split by text length; qa: question-answer pair extraction | ✅ |
| chunkSize | Estimated chunk size | |
| chunkSplitter | Custom highest-priority split character | |
| qaPrompt | QA split prompt | |
@ -1079,7 +1079,7 @@ curl --location --request POST 'https://api.fastgpt.in/api/core/dataset/data/pus
--header 'Content-Type: application/json' \
--data-raw '{
    "collectionId": "64663f451ba1676dbdef0499",
"trainingMode": "chunk",
"trainingType": "chunk",
"prompt": "可选。qa 拆分引导词chunk 模式下忽略",
"billId": "可选。如果有这个值,本次的数据会被聚合到一个订单中,这个值可以重复使用。可以参考 [创建训练订单] 获取该值。",
    "data": [

View File

@ -0,0 +1,27 @@
---
title: 'V4.9.0 (in progress)'
description: 'FastGPT V4.9.0 release notes'
icon: 'upgrade'
draft: false
toc: true
weight: 803
---
## Important updates
1. Deprecated - The legacy local file upload API `/api/core/dataset/collection/create/file` (previously a commercial-edition-only API that has since been opened up) is deprecated; switch to `/api/core/dataset/collection/create/localFile`.
2. Maintenance stopped, soon to be deprecated - the external file library APIs; they can be replaced with the API file library.
3. API update - For the endpoints that upload files to a dataset, create link collections, the API file library and push chunk data, the `trainingType` field will in future only support the `chunk` and `QA` modes. The enhanced-index mode moves to a separate field: `autoIndexes`. Compatibility code for the legacy `trainingType=auto` still exists, but please migrate to the new interface types as soon as possible (a request-body sketch follows this list). Details: [Dataset OpenAPI docs](/docs/development/openapi/dataset.md)
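Under the new layout, a push-data request body looks roughly like the sketch below (TypeScript, using the types added in this commit; the collection id and data are placeholders, and the `q`/`a` item shape follows the push-data docs above):

```ts
import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';
import type { PushDatasetDataProps } from '@fastgpt/global/core/dataset/api.d';

// Old callers sent trainingType (or trainingMode) = 'auto'; now trainingType
// stays chunk/qa and enhanced indexing moves to the separate autoIndexes flag.
const body: PushDatasetDataProps = {
  collectionId: '64663f451ba1676dbdef0499', // placeholder id
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  autoIndexes: true, // replaces trainingType=auto
  imageIndex: false, // new: build image indexes with the dataset's VLM
  data: [{ q: 'What is FastGPT?', a: 'An open-source LLM knowledge-base platform.' }]
};
```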
## 🚀 New features
1. The enhanced PDF parsing option has been added to the page UI. The Doc2x service is also embedded, so PDF files can be parsed with Doc2x directly (a configuration sketch follows this list).
2. Automatic image annotation, together with changes to the data logic and interaction of the dataset file upload flow.
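The external parser is enabled through the new `systemEnv.customPdfParse` block in config.json; a hedged sketch of its shape (keys per the `SystemEnvType` change in this commit, values are placeholders):

```ts
// systemEnv.customPdfParse, shown here as a typed object for readability.
const customPdfParse = {
  url: '',               // self-hosted parse service; when set, it takes priority
  key: '',               // optional bearer token for that service
  doc2xKey: 'sk-xxxxxx', // Doc2x API key; either url or doc2xKey turns the feature on
  price: 0               // points charged per parsed page
};
```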
## ⚙️ Optimizations
1. Dataset data no longer limits the number of indexes, which can be customized without limit. Indexes built from the input text are also updated automatically, without affecting custom indexes.
## 🐛 Fixes

View File

@ -1,31 +0,0 @@
export const retryRun = <T>(fn: () => T, retry = 2): T => {
try {
return fn();
} catch (error) {
if (retry > 0) {
return retryRun(fn, retry - 1);
}
throw error;
}
};
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
const batchArr = new Array(batchSize).fill(null);
const result: any[] = [];
const batchFn = async () => {
const data = arr.shift();
if (data) {
result.push(await fn(data));
return batchFn();
}
};
await Promise.all(
batchArr.map(async () => {
await batchFn();
})
);
return result;
};

View File

@ -1,4 +1,4 @@
import { batchRun } from '../fn/utils';
import { batchRun } from '../system/utils';
import { getNanoid, simpleText } from './tools';
import type { ImageType } from '../../../service/worker/readFile/type';
@ -37,6 +37,80 @@ export const simpleMarkdownText = (rawText: string) => {
return rawText.trim();
};
export const htmlTable2Md = (content: string): string => {
return content.replace(/<table>[\s\S]*?<\/table>/g, (htmlTable) => {
try {
// Clean up whitespace and newlines
const cleanHtml = htmlTable.replace(/\n\s*/g, '');
const rows = cleanHtml.match(/<tr>(.*?)<\/tr>/g);
if (!rows) return htmlTable;
// Parse table data
let tableData: string[][] = [];
let maxColumns = 0;
// Try to convert to markdown table
rows.forEach((row, rowIndex) => {
if (!tableData[rowIndex]) {
tableData[rowIndex] = [];
}
let colIndex = 0;
const cells = row.match(/<td.*?>(.*?)<\/td>/g) || [];
cells.forEach((cell) => {
while (tableData[rowIndex][colIndex]) {
colIndex++;
}
const colspan = parseInt(cell.match(/colspan="(\d+)"/)?.[1] || '1');
const rowspan = parseInt(cell.match(/rowspan="(\d+)"/)?.[1] || '1');
const content = cell.replace(/<td.*?>|<\/td>/g, '').trim();
for (let i = 0; i < rowspan; i++) {
for (let j = 0; j < colspan; j++) {
if (!tableData[rowIndex + i]) {
tableData[rowIndex + i] = [];
}
tableData[rowIndex + i][colIndex + j] = i === 0 && j === 0 ? content : '^^';
}
}
colIndex += colspan;
maxColumns = Math.max(maxColumns, colIndex);
});
for (let i = 0; i < maxColumns; i++) {
if (!tableData[rowIndex][i]) {
tableData[rowIndex][i] = ' ';
}
}
});
const chunks: string[] = [];
const headerCells = tableData[0]
.slice(0, maxColumns)
.map((cell) => (cell === '^^' ? ' ' : cell || ' '));
const headerRow = '| ' + headerCells.join(' | ') + ' |';
chunks.push(headerRow);
const separator = '| ' + Array(headerCells.length).fill('---').join(' | ') + ' |';
chunks.push(separator);
tableData.slice(1).forEach((row) => {
const paddedRow = row
.slice(0, maxColumns)
.map((cell) => (cell === '^^' ? ' ' : cell || ' '));
while (paddedRow.length < maxColumns) {
paddedRow.push(' ');
}
chunks.push('| ' + paddedRow.join(' | ') + ' |');
});
return chunks.join('\n');
} catch (error) {
return htmlTable;
}
});
};
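For reference, a small illustrative input/output pair for the converter (not taken from the repo's tests): spanned cells are expanded and the extra slots left blank.

```ts
const md = htmlTable2Md(
  '<table><tr><td colspan="2">Name</td></tr><tr><td>A</td><td>B</td></tr></table>'
);
// Produces (roughly):
// | Name |   |
// | --- | --- |
// | A | B |
```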
/**
* format markdown
* 1. upload base64

View File

@ -43,10 +43,14 @@ export type FastGPTConfigFileType = {
export type FastGPTFeConfigsType = {
show_workorder?: boolean;
show_emptyChat?: boolean;
isPlus?: boolean;
register_method?: ['email' | 'phone' | 'sync'];
login_method?: ['email' | 'phone']; // Attention: login method is different from oauth
find_password_method?: ['email' | 'phone'];
bind_notification_method?: ['email' | 'phone'];
googleClientVerKey?: string;
show_emptyChat?: boolean;
show_appStore?: boolean;
show_git?: boolean;
show_pay?: boolean;
@ -57,15 +61,19 @@ export type FastGPTFeConfigsType = {
show_aiproxy?: boolean;
concatMd?: string;
concatMd?: string;
docUrl?: string;
openAPIDocUrl?: string;
systemPluginCourseUrl?: string;
appTemplateCourse?: string;
customApiDomain?: string;
customSharePageDomain?: string;
systemTitle?: string;
systemDescription?: string;
googleClientVerKey?: string;
isPlus?: boolean;
scripts?: { [key: string]: string }[];
favicon?: string;
sso?: {
icon?: string;
title?: string;
@ -91,13 +99,14 @@ export type FastGPTFeConfigsType = {
exportDatasetLimitMinutes?: number;
websiteSyncLimitMinuted?: number;
};
scripts?: { [key: string]: string }[];
favicon?: string;
customApiDomain?: string;
customSharePageDomain?: string;
uploadFileMaxAmount?: number;
uploadFileMaxSize?: number;
// Compute by systemEnv.customPdfParse
showCustomPdfParse?: boolean;
customPdfParsePrice?: number;
lafEnv?: string;
navbarItems?: NavbarItemType[];
externalProviderWorkflowVariables?: ExternalProviderWorkflowVarType[];
@ -107,9 +116,18 @@ export type SystemEnvType = {
openapiPrefix?: string;
vectorMaxProcess: number;
qaMaxProcess: number;
vlmMaxProcess: number;
pgHNSWEfSearch: number;
tokenWorkers: number; // token count max worker
oneapiUrl?: string;
chatApiKey?: string;
customPdfParse?: {
url?: string;
key?: string;
doc2xKey?: string;
price?: number; // n points/1 page
};
};

View File

@ -16,3 +16,24 @@ export const retryFn = async <T>(fn: () => Promise<T>, retryTimes = 3): Promise<
return Promise.reject(error);
}
};
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
const batchArr = new Array(batchSize).fill(null);
const result: any[] = [];
const batchFn = async () => {
const data = arr.shift();
if (data) {
result.push(await fn(data));
return batchFn();
}
};
await Promise.all(
batchArr.map(async () => {
await batchFn();
})
);
return result;
};
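A small usage sketch (the `fetchDoc` helper is hypothetical): `batchRun` keeps at most `batchSize` calls in flight, drains the input array in place via `shift()`, and skips falsy items.

```ts
// At most 2 concurrent calls; `ids` is consumed as items are processed,
// and results are pushed in completion order rather than input order.
const ids = ['docA', 'docB', 'docC'];
const contents = await batchRun(ids, (id) => fetchDoc(id), 2); // fetchDoc: hypothetical async loader
```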

View File

@ -22,7 +22,7 @@ export const defaultQAModels: LLMModelItemType[] = [
maxTemperature: 1.2,
charsPointsPrice: 0,
censor: false,
vision: false,
vision: true,
datasetProcess: true,
toolChoice: true,
functionCall: false,
@ -59,10 +59,17 @@ export const defaultSTTModels: STTModelType[] = [
export const getModelFromList = (
modelList: { provider: ModelProviderIdType; name: string; model: string }[],
model: string
) => {
):
| {
avatar: string;
provider: ModelProviderIdType;
name: string;
model: string;
}
| undefined => {
const modelData = modelList.find((item) => item.model === model) ?? modelList[0];
if (!modelData) {
throw new Error('No Key model is configured');
return;
}
const provider = getModelProvider(modelData.provider);
return {

View File

@ -1,5 +1,5 @@
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
import type { LLMModelItemType } from '../ai/model.d';
import { ParentIdType } from 'common/parentFolder/type';
@ -10,9 +10,11 @@ export type DatasetUpdateBody = {
name?: string;
avatar?: string;
intro?: string;
agentModel?: LLMModelItemType;
status?: DatasetSchemaType['status'];
agentModel?: string;
vlmModel?: string;
websiteConfig?: DatasetSchemaType['websiteConfig'];
externalReadUrl?: DatasetSchemaType['externalReadUrl'];
defaultPermission?: DatasetSchemaType['defaultPermission'];
@ -27,7 +29,10 @@ export type DatasetUpdateBody = {
/* ================= collection ===================== */
export type DatasetCollectionChunkMetadataType = {
parentId?: string;
trainingType?: TrainingModeEnum;
customPdfParse?: boolean;
trainingType?: DatasetCollectionDataProcessModeEnum;
imageIndex?: boolean;
autoIndexes?: boolean;
chunkSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
@ -131,9 +136,15 @@ export type PostWebsiteSyncParams = {
export type PushDatasetDataProps = {
collectionId: string;
data: PushDatasetDataChunkProps[];
trainingMode: TrainingModeEnum;
trainingType?: DatasetCollectionDataProcessModeEnum;
autoIndexes?: boolean;
imageIndex?: boolean;
prompt?: string;
billId?: string;
// Abandon
trainingMode?: DatasetCollectionDataProcessModeEnum;
};
export type PushDatasetDataResponse = {
insertLen: number;

View File

@ -1,4 +1,4 @@
import { DatasetCollectionTypeEnum, TrainingModeEnum, TrainingTypeMap } from '../constants';
import { DatasetCollectionTypeEnum } from '../constants';
import { DatasetCollectionSchemaType } from '../type';
export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType) => {
@ -16,9 +16,3 @@ export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType
export const checkCollectionIsFolder = (type: DatasetCollectionTypeEnum) => {
return type === DatasetCollectionTypeEnum.folder || type === DatasetCollectionTypeEnum.virtual;
};
export const getTrainingTypeLabel = (type?: TrainingModeEnum) => {
if (!type) return '';
if (!TrainingTypeMap[type]) return '';
return TrainingTypeMap[type].label;
};

View File

@ -109,6 +109,26 @@ export const DatasetCollectionSyncResultMap = {
}
};
export enum DatasetCollectionDataProcessModeEnum {
chunk = 'chunk',
qa = 'qa',
auto = 'auto' // abandon
}
export const DatasetCollectionDataProcessModeMap = {
[DatasetCollectionDataProcessModeEnum.chunk]: {
label: i18nT('common:core.dataset.training.Chunk mode'),
tooltip: i18nT('common:core.dataset.import.Chunk Split Tip')
},
[DatasetCollectionDataProcessModeEnum.qa]: {
label: i18nT('common:core.dataset.training.QA mode'),
tooltip: i18nT('common:core.dataset.import.QA Import Tip')
},
[DatasetCollectionDataProcessModeEnum.auto]: {
label: i18nT('common:core.dataset.training.Auto mode'),
tooltip: i18nT('common:core.dataset.training.Auto mode Tip')
}
};
/* ------------ data -------------- */
/* ------------ training -------------- */
@ -124,28 +144,11 @@ export enum ImportDataSourceEnum {
export enum TrainingModeEnum {
chunk = 'chunk',
qa = 'qa',
auto = 'auto',
qa = 'qa'
image = 'image'
}
export const TrainingTypeMap = {
[TrainingModeEnum.chunk]: {
label: i18nT('common:core.dataset.training.Chunk mode'),
tooltip: i18nT('common:core.dataset.import.Chunk Split Tip'),
openSource: true
},
[TrainingModeEnum.auto]: {
label: i18nT('common:core.dataset.training.Auto mode'),
tooltip: i18nT('common:core.dataset.training.Auto mode Tip'),
openSource: false
},
[TrainingModeEnum.qa]: {
label: i18nT('common:core.dataset.training.QA mode'),
tooltip: i18nT('common:core.dataset.import.QA Import Tip'),
openSource: true
}
};
/* ------------ search -------------- */
export enum DatasetSearchModeEnum {
embedding = 'embedding',

View File

@ -20,9 +20,22 @@ export type UpdateDatasetDataProps = {
})[];
};
export type PatchIndexesProps = {
type: 'create' | 'update' | 'delete' | 'unChange';
index: Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string;
};
};
export type PatchIndexesProps =
| {
type: 'create';
index: Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string;
};
}
| {
type: 'update';
index: DatasetDataIndexItemType;
}
| {
type: 'delete';
index: DatasetDataIndexItemType;
}
| {
type: 'unChange';
index: DatasetDataIndexItemType;
};
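The discriminated union lets consumers narrow on `type`: `dataId` is optional only for `create`, and carried by the full `DatasetDataIndexItemType` for the other variants. A hypothetical handler, for illustration:

```ts
const describePatch = (patch: PatchIndexesProps): string => {
  switch (patch.type) {
    case 'create':
      // Not yet written to PG, so dataId may be undefined here.
      return `create index "${patch.index.text.slice(0, 20)}"`;
    case 'update':
    case 'delete':
    case 'unChange':
      // These variants always carry an existing dataId.
      return `${patch.type} index ${patch.index.dataId}`;
  }
};
```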

View File

@ -0,0 +1,42 @@
import { i18nT } from '../../../../web/i18n/utils';
export enum DatasetDataIndexTypeEnum {
default = 'default',
custom = 'custom',
summary = 'summary',
question = 'question',
image = 'image'
}
export const DatasetDataIndexMap: Record<
`${DatasetDataIndexTypeEnum}`,
{
label: any;
color: string;
}
> = {
[DatasetDataIndexTypeEnum.default]: {
label: i18nT('dataset:data_index_default'),
color: 'gray'
},
[DatasetDataIndexTypeEnum.custom]: {
label: i18nT('dataset:data_index_custom'),
color: 'blue'
},
[DatasetDataIndexTypeEnum.summary]: {
label: i18nT('dataset:data_index_summary'),
color: 'green'
},
[DatasetDataIndexTypeEnum.question]: {
label: i18nT('dataset:data_index_question'),
color: 'red'
},
[DatasetDataIndexTypeEnum.image]: {
label: i18nT('dataset:data_index_image'),
color: 'purple'
}
};
export const defaultDatasetIndexData = DatasetDataIndexMap[DatasetDataIndexTypeEnum.custom];
export const getDatasetIndexMapData = (type: `${DatasetDataIndexTypeEnum}`) => {
return DatasetDataIndexMap[type] || defaultDatasetIndexData;
};

View File

@ -0,0 +1,20 @@
import { PushDatasetDataChunkProps } from '../api';
import { TrainingModeEnum } from '../constants';
export type PushDataToTrainingQueueProps = {
teamId: string;
tmbId: string;
datasetId: string;
collectionId: string;
mode?: TrainingModeEnum;
data: PushDatasetDataChunkProps[];
prompt?: string;
agentModel: string;
vectorModel: string;
vlmModel?: string;
billId?: string;
session?: ClientSession;
};

View File

@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
import { PermissionTypeEnum } from '../../support/permission/constant';
import { PushDatasetDataChunkProps } from './api';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum,
DatasetStatusEnum,
DatasetTypeEnum,
@ -12,6 +13,7 @@ import { DatasetPermission } from '../../support/permission/dataset/controller';
import { Permission } from '../../support/permission/controller';
import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
import { SourceMemberType } from 'support/user/type';
import { DatasetDataIndexTypeEnum } from './data/constants';
export type DatasetSchemaType = {
_id: string;
@ -23,11 +25,14 @@ export type DatasetSchemaType = {
avatar: string;
name: string;
vectorModel: string;
agentModel: string;
intro: string;
type: `${DatasetTypeEnum}`;
status: `${DatasetStatusEnum}`;
vectorModel: string;
agentModel: string;
vlmModel?: string;
websiteConfig?: {
url: string;
selector: string;
@ -52,26 +57,22 @@ export type DatasetCollectionSchemaType = {
parentId?: string;
name: string;
type: DatasetCollectionTypeEnum;
createTime: Date;
updateTime: Date;
forbid?: boolean;
trainingType: TrainingModeEnum;
chunkSize: number;
chunkSplitter?: string;
qaPrompt?: string;
ocrParse?: boolean;
tags?: string[];
createTime: Date;
updateTime: Date;
// Status
forbid?: boolean;
nextSyncTime?: Date;
// Collection metadata
fileId?: string; // local file id
rawLink?: string; // link url
externalFileId?: string; //external file id
apiFileId?: string; // api file id
externalFileUrl?: string; // external import url
nextSyncTime?: Date;
rawTextLength?: number;
hashRawText?: string;
metadata?: {
@ -80,6 +81,16 @@ export type DatasetCollectionSchemaType = {
[key: string]: any;
};
// Parse settings
customPdfParse?: boolean;
// Chunk settings
autoIndexes?: boolean;
imageIndex?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSize: number;
chunkSplitter?: string;
qaPrompt?: string;
};
export type DatasetCollectionTagsSchemaType = {
@ -90,7 +101,7 @@ export type DatasetCollectionTagsSchemaType = {
};
export type DatasetDataIndexItemType = {
defaultIndex: boolean;
type: `${DatasetDataIndexTypeEnum}`;
dataId: string; // pg data id
text: string;
};
@ -141,6 +152,7 @@ export type DatasetTrainingSchemaType = {
chunkIndex: number;
weight: number;
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
retryCount: number;
};
export type CollectionWithDatasetType = DatasetCollectionSchemaType & {
@ -169,9 +181,10 @@ export type DatasetListItemType = {
sourceMember?: SourceMemberType;
};
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel'> & {
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel' | 'vlmModel'> & {
vectorModel: EmbeddingModelItemType;
agentModel: LLMModelItemType;
vlmModel?: LLMModelItemType;
permission: DatasetPermission;
};

View File

@ -1,6 +1,7 @@
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { getFileIcon } from '../../common/file/icon';
import { strIsLink } from '../../common/string/tools';
import { DatasetDataIndexTypeEnum } from './data/constants';
export function getCollectionIcon(
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
@ -38,14 +39,23 @@ export function getSourceNameIcon({
}
/* get dataset data default index */
export function getDefaultIndex(props?: { q?: string; a?: string; dataId?: string }) {
const { q = '', a, dataId } = props || {};
const qaStr = `${q}\n${a}`.trim();
return {
defaultIndex: true,
text: a ? qaStr : q,
dataId
};
export function getDefaultIndex(props?: { q?: string; a?: string }) {
const { q = '', a } = props || {};
return [
{
text: q,
type: DatasetDataIndexTypeEnum.default
},
...(a
? [
{
text: a,
type: DatasetDataIndexTypeEnum.default
}
]
: [])
];
}
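The rewritten helper returns an array of typed default indexes (one for `q`, one for `a` when present) instead of the old single `{ defaultIndex: true }` object:

```ts
getDefaultIndex({ q: 'What is RAG?', a: 'Retrieval-augmented generation.' });
// => [
//   { text: 'What is RAG?', type: DatasetDataIndexTypeEnum.default },
//   { text: 'Retrieval-augmented generation.', type: DatasetDataIndexTypeEnum.default }
// ]
```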
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {

View File

@ -10,7 +10,8 @@ export enum UsageSourceEnum {
wecom = 'wecom',
feishu = 'feishu',
dingtalk = 'dingtalk',
official_account = 'official_account'
official_account = 'official_account',
pdfParse = 'pdfParse'
}
export const UsageSourceMap = {
@ -43,5 +44,8 @@ export const UsageSourceMap = {
},
[UsageSourceEnum.dingtalk]: {
label: i18nT('account_usage:dingtalk')
},
[UsageSourceEnum.pdfParse]: {
label: i18nT('account_usage:pdf_parse')
}
};

View File

@ -7,6 +7,7 @@ export type UsageListItemCountType = {
outputTokens?: number;
charsLength?: number;
duration?: number;
pages?: number;
// deprecated
tokens?: number;

View File

@ -186,20 +186,25 @@ export async function getDownloadStream({
export const readFileContentFromMongo = async ({
teamId,
tmbId,
bucketName,
fileId,
isQAImport = false
isQAImport = false,
customPdfParse = false
}: {
teamId: string;
tmbId: string;
bucketName: `${BucketNameEnum}`;
fileId: string;
isQAImport?: boolean;
customPdfParse?: boolean;
}): Promise<{
rawText: string;
filename: string;
}> => {
const bufferId = `${fileId}-${customPdfParse}`;
// read buffer
const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: fileId }, undefined, {
const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: bufferId }, undefined, {
...readFromSecondary
}).lean();
if (fileBuffer) {
@ -227,9 +232,11 @@ export const readFileContentFromMongo = async ({
// Get raw text
const { rawText } = await readRawContentByFileBuffer({
customPdfParse,
extension,
isQAImport,
teamId,
tmbId,
buffer: fileBuffers,
encoding,
metadata: {
@ -240,7 +247,7 @@ export const readFileContentFromMongo = async ({
// < 14M
if (fileBuffers.length < 14 * 1024 * 1024 && rawText.trim()) {
MongoRawTextBuffer.create({
sourceId: fileId,
sourceId: bufferId,
rawText,
metadata: {
filename: file.filename

View File

@ -0,0 +1,27 @@
import axios from 'axios';
import { addLog } from '../../system/log';
import { serverRequestBaseUrl } from '../../api/serverRequest';
import { getFileContentTypeFromHeader, guessBase64ImageType } from '../utils';
export const getImageBase64 = async (url: string) => {
addLog.debug(`Load image to base64: ${url}`);
try {
const response = await axios.get(url, {
baseURL: serverRequestBaseUrl,
responseType: 'arraybuffer',
proxy: false
});
const base64 = Buffer.from(response.data, 'binary').toString('base64');
const imageType =
getFileContentTypeFromHeader(response.headers['content-type']) ||
guessBase64ImageType(base64);
return `data:${imageType};base64,${base64}`;
} catch (error) {
addLog.debug(`Load image to base64 failed: ${url}`);
console.log(error);
return Promise.reject(error);
}
};

View File

@ -1,18 +1,23 @@
import { uploadMongoImg } from '../image/controller';
import FormData from 'form-data';
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import fs from 'fs';
import type { ReadFileResponse } from '../../../worker/readFile/type';
import type { ImageType, ReadFileResponse } from '../../../worker/readFile/type';
import axios from 'axios';
import { addLog } from '../../system/log';
import { batchRun } from '@fastgpt/global/common/fn/utils';
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { batchRun } from '@fastgpt/global/common/system/utils';
import { htmlTable2Md, matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { delay } from '@fastgpt/global/common/system/utils';
import { getNanoid } from '@fastgpt/global/common/string/tools';
export type readRawTextByLocalFileParams = {
teamId: string;
tmbId: string;
path: string;
encoding: string;
customPdfParse?: boolean;
metadata?: Record<string, any>;
};
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
@ -22,46 +27,51 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam
const buffer = await fs.promises.readFile(path);
const { rawText } = await readRawContentByFileBuffer({
return readRawContentByFileBuffer({
extension,
isQAImport: false,
customPdfParse: params.customPdfParse,
teamId: params.teamId,
tmbId: params.tmbId,
encoding: params.encoding,
buffer,
metadata: params.metadata
});
return {
rawText
};
};
export const readRawContentByFileBuffer = async ({
extension,
isQAImport,
teamId,
tmbId,
extension,
buffer,
encoding,
metadata
metadata,
customPdfParse = false,
isQAImport = false
}: {
isQAImport?: boolean;
extension: string;
teamId: string;
tmbId: string;
extension: string;
buffer: Buffer;
encoding: string;
metadata?: Record<string, any>;
}) => {
// Custom read file service
const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
const readFileFromCustomService = async (): Promise<ReadFileResponse | undefined> => {
if (
!customReadfileUrl ||
!customReadFileExtension ||
!customReadFileExtension.includes(extension)
)
return;
customPdfParse?: boolean;
isQAImport: boolean;
}): Promise<ReadFileResponse> => {
const systemParse = () =>
runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
extension,
encoding,
buffer,
teamId
});
const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
const url = global.systemEnv.customPdfParse?.url;
const token = global.systemEnv.customPdfParse?.key;
if (!url) return systemParse();
const start = Date.now();
addLog.info('Parsing files from an external service');
@ -70,20 +80,18 @@ export const readRawContentByFileBuffer = async ({
data.append('file', buffer, {
filename: `file.${extension}`
});
data.append('extension', extension);
data.append('ocr', ocrParse);
const { data: response } = await axios.post<{
success: boolean;
message: string;
data: {
page: number;
markdown: string;
duration: number;
};
}>(customReadfileUrl, data, {
}>(url, data, {
timeout: 600000,
headers: {
...data.getHeaders()
...data.getHeaders(),
Authorization: token ? `Bearer ${token}` : undefined
}
});
@ -92,21 +100,208 @@ export const readRawContentByFileBuffer = async ({
const rawText = response.data.markdown;
const { text, imageList } = matchMdImgTextAndUpload(rawText);
createPdfParseUsage({
teamId,
tmbId,
pages: response.data.page
});
return {
rawText: text,
formatText: rawText,
imageList
};
};
const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
if (!doc2xKey) return systemParse();
let { rawText, formatText, imageList } =
(await readFileFromCustomService()) ||
(await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
extension,
encoding,
buffer,
teamId
}));
const parseTextImage = async (text: string) => {
// Extract image links and convert to base64
const imageList: { id: string; url: string }[] = [];
const processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
const id = getNanoid();
imageList.push({
id,
url
});
return `![](${id})`;
});
let resultImageList: ImageType[] = [];
await Promise.all(
imageList.map(async (item) => {
try {
const response = await axios.get(item.url, { responseType: 'arraybuffer' });
const mime = response.headers['content-type'] || 'image/jpeg';
const base64 = response.data.toString('base64');
resultImageList.push({
uuid: item.id,
mime,
base64
});
} catch (error) {
addLog.warn(`Failed to get image from ${item.url}: ${getErrText(error)}`);
}
})
);
return {
text: processedText,
imageList: resultImageList
};
};
let startTime = Date.now();
// 1. Get pre-upload URL first
const { data: preupload_data } = await axios
.post<{ code: string; data: { uid: string; url: string } }>(
'https://v2.doc2x.noedgeai.com/api/v2/parse/preupload',
null,
{
headers: {
Authorization: `Bearer ${doc2xKey}`
}
}
)
.catch((error) => {
return Promise.reject(
`[Pre-upload Error] Failed to get pre-upload URL: ${getErrText(error)}`
);
});
if (preupload_data?.code !== 'success') {
return Promise.reject(`Failed to get pre-upload URL: ${JSON.stringify(preupload_data)}`);
}
const upload_url = preupload_data.data.url;
const uid = preupload_data.data.uid;
// 2. Upload file to pre-signed URL with binary stream
const blob = new Blob([buffer], { type: 'application/pdf' });
const response = await axios
.put(upload_url, blob, {
headers: {
'Content-Type': 'application/pdf'
}
})
.catch((error) => {
return Promise.reject(`[Upload Error] Failed to upload file: ${getErrText(error)}`);
});
if (response.status !== 200) {
return Promise.reject(`Upload failed with status ${response.status}: ${response.statusText}`);
}
await delay(5000);
addLog.debug(`Uploaded file to Doc2x, uid: ${uid}`);
// 3. Get the result by uid
const checkResult = async (retry = 30) => {
if (retry <= 0) {
return Promise.reject(
`[Parse Timeout Error] Failed to get result (uid: ${uid}): Process timeout`
);
}
try {
const { data: result_data } = await axios
.get<{
code: string;
data: {
progress: number;
status: 'processing' | 'failed' | 'success';
result: {
pages: {
md: string;
}[];
};
};
}>(`https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`, {
headers: {
Authorization: `Bearer ${doc2xKey}`
}
})
.catch((error) => {
return Promise.reject(
`[Parse Status Error] Failed to get parse status: ${getErrText(error)}`
);
});
// Error
if (!['ok', 'success'].includes(result_data.code)) {
return Promise.reject(
`Failed to get result (uid: ${uid}): ${JSON.stringify(result_data)}`
);
}
// Process
if (['ready', 'processing'].includes(result_data.data.status)) {
addLog.debug(`Waiting for the result, uid: ${uid}`);
await delay(5000);
return checkResult(retry - 1);
}
// Finish
if (result_data.data.status === 'success') {
const result = result_data.data.result.pages
.map((page) => page.md)
.join('\n')
// Do some post-processing
.replace(/\\[\(\)]/g, '$')
.replace(/\\[\[\]]/g, '$$')
.replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '![img]($1)')
.replace(/<!-- Media -->/g, '')
.replace(/<!-- Footnote -->/g, '')
.replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$')
.replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}');
const { text, imageList } = await parseTextImage(htmlTable2Md(result));
return {
pages: result_data.data.result.pages.length,
text,
imageList
};
}
return checkResult(retry - 1);
} catch (error) {
if (retry > 1) {
await delay(100);
return checkResult(retry - 1);
}
return Promise.reject(error);
}
};
const { pages, text, imageList } = await checkResult();
createPdfParseUsage({
teamId,
tmbId,
pages
});
addLog.info(`Doc2x parse success, time: ${Date.now() - startTime}ms`);
return {
rawText: text,
formatText: text,
imageList
};
};
// Custom read file service
const pdfParseFn = async (): Promise<ReadFileResponse> => {
if (!customPdfParse) return systemParse();
if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();
return systemParse();
};
let { rawText, formatText, imageList } = await (async () => {
if (extension === 'pdf') {
return await pdfParseFn();
}
return await systemParse();
})();
// markdown data format
if (imageList) {
@ -142,5 +337,5 @@ export const readRawContentByFileBuffer = async ({
}
}
return { rawText };
return { rawText, formatText, imageList };
};
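A hedged call-site sketch of the reworked reader: `customPdfParse` routes PDFs to the external service or Doc2x when configured (falling back to the worker parser otherwise), and `tmbId` is now required so parsing usage can be billed.

```ts
const { rawText, formatText, imageList } = await readRawContentByFileBuffer({
  extension: 'pdf',
  teamId,             // assumed to be in scope at the call site
  tmbId,              // new: needed for createPdfParseUsage accounting
  buffer: pdfBuffer,  // a Buffer read from GridFS or local disk
  encoding: 'utf-8',
  customPdfParse: true,
  isQAImport: false
});
```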

View File

@ -10,6 +10,11 @@ export const SERVICE_LOCAL_HOST =
export const initFastGPTConfig = (config?: FastGPTConfigFileType) => {
if (!config) return;
// Special config computed
config.feConfigs.showCustomPdfParse =
!!config.systemEnv.customPdfParse?.url || !!config.systemEnv.customPdfParse?.doc2xKey;
config.feConfigs.customPdfParsePrice = config.systemEnv.customPdfParse?.price || 0;
global.feConfigs = config.feConfigs;
global.systemEnv = config.systemEnv;
global.subPlans = config.subPlans;

View File

@ -13,6 +13,11 @@ export const getDatasetModel = (model?: string) => {
?.find((item) => item.model === model || item.name === model) ?? getDefaultLLMModel()
);
};
export const getVlmModel = (model?: string) => {
return Array.from(global.llmModelMap.values())
?.filter((item) => item.vision)
?.find((item) => item.model === model || item.name === model);
};
export const getDefaultEmbeddingModel = () => global?.systemDefaultModel.embedding!;
export const getEmbeddingModel = (model?: string) => {

View File

@ -9,10 +9,9 @@ import type {
} from '@fastgpt/global/core/ai/type.d';
import axios from 'axios';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import { getFileContentTypeFromHeader, guessBase64ImageType } from '../../common/file/utils';
import { serverRequestBaseUrl } from '../../common/api/serverRequest';
import { i18nT } from '../../../web/i18n/utils';
import { addLog } from '../../common/system/log';
import { getImageBase64 } from '../../common/file/image/utils';
export const filterGPTMessageByMaxContext = async ({
messages = [],
@ -166,25 +165,13 @@ export const loadRequestMessages = async ({
try {
// If imgUrl is a local path, load image from local, and set url to base64
if (imgUrl.startsWith('/') || process.env.MULTIPLE_DATA_TO_BASE64 === 'true') {
addLog.debug('Load image from local server', {
baseUrl: serverRequestBaseUrl,
requestUrl: imgUrl
});
const response = await axios.get(imgUrl, {
baseURL: serverRequestBaseUrl,
responseType: 'arraybuffer',
proxy: false
});
const base64 = Buffer.from(response.data, 'binary').toString('base64');
const imageType =
getFileContentTypeFromHeader(response.headers['content-type']) ||
guessBase64ImageType(base64);
const base64 = await getImageBase64(imgUrl);
return {
...item,
image_url: {
...item.image_url,
url: `data:${imageType};base64,${base64}`
url: base64
}
};
}
@ -223,7 +210,8 @@ export const loadRequestMessages = async ({
await Promise.all(
content.map(async (item) => {
if (item.type === 'text') {
if (item.text) return parseStringWithImages(item.text);
// If it is an array, no need to parse images
if (item.text) return item;
return;
}
if (item.type === 'file_url') return; // LLM not support file_url

View File

@ -108,7 +108,15 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
return formattedFiles;
};
const getFileContent = async ({ teamId, apiFileId }: { teamId: string; apiFileId: string }) => {
const getFileContent = async ({
teamId,
tmbId,
apiFileId
}: {
teamId: string;
tmbId: string;
apiFileId: string;
}) => {
const data = await request<APIFileContentResponse>(
`/v1/file/content`,
{ id: apiFileId },
@ -123,6 +131,7 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
if (previewUrl) {
const rawText = await readFileRawTextByUrl({
teamId,
tmbId,
url: previewUrl,
relatedId: apiFileId
});

View File

@ -1,6 +1,6 @@
import {
DatasetCollectionTypeEnum,
TrainingModeEnum
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
@ -19,13 +19,14 @@ import { predictDataLimitLength } from '../../../../global/core/dataset/utils';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { createTrainingUsage } from '../../../support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { pushDataListToTrainingQueue } from '../training/controller';
import { MongoImage } from '../../../common/file/image/schema';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { addDays } from 'date-fns';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { delay, retryFn } from '@fastgpt/global/common/system/utils';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils';
export const createCollectionAndInsertData = async ({
dataset,
@ -33,6 +34,7 @@ export const createCollectionAndInsertData = async ({
relatedId,
createCollectionParams,
isQAImport = false,
billId,
session
}: {
dataset: DatasetSchemaType;
@ -41,13 +43,21 @@ export const createCollectionAndInsertData = async ({
createCollectionParams: CreateOneCollectionParams;
isQAImport?: boolean;
billId?: string;
session?: ClientSession;
}) => {
// Adapter 4.9.0
if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) {
createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk;
createCollectionParams.autoIndexes = true;
}
const teamId = createCollectionParams.teamId;
const tmbId = createCollectionParams.tmbId;
// Chunk split params
const trainingType = createCollectionParams.trainingType || TrainingModeEnum.chunk;
const chunkSize = createCollectionParams.chunkSize;
const trainingType =
createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
const chunkSize = createCollectionParams.chunkSize || 512;
const chunkSplitter = createCollectionParams.chunkSplitter;
const qaPrompt = createCollectionParams.qaPrompt;
const usageName = createCollectionParams.name;
@ -56,7 +66,7 @@ export const createCollectionAndInsertData = async ({
const chunks = rawText2Chunks({
rawText,
chunkLen: chunkSize,
overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [],
isQAImport
});
@ -64,7 +74,14 @@ export const createCollectionAndInsertData = async ({
// 2. auth limit
await checkDatasetLimit({
teamId,
insertLen: predictDataLimitLength(trainingType, chunks)
insertLen: predictDataLimitLength(
getTrainingModeByCollection({
trainingType,
autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex
}),
chunks
)
});
const fn = async (session: ClientSession) => {
@ -89,15 +106,20 @@ export const createCollectionAndInsertData = async ({
});
// 4. create training bill
const { billId } = await createTrainingUsage({
teamId,
tmbId,
appName: usageName,
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name,
session
});
const traingBillId = await (async () => {
if (billId) return billId;
const { billId: newBillId } = await createTrainingUsage({
teamId,
tmbId,
appName: usageName,
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name,
vllmModel: getVlmModel(dataset.vlmModel)?.name,
session
});
return newBillId;
})();
// 5. insert to training queue
const insertResults = await pushDataListToTrainingQueue({
@ -107,9 +129,14 @@ export const createCollectionAndInsertData = async ({
collectionId,
agentModel: dataset.agentModel,
vectorModel: dataset.vectorModel,
trainingMode: trainingType,
vlmModel: dataset.vlmModel,
mode: getTrainingModeByCollection({
trainingType,
autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex
}),
prompt: qaPrompt,
billId,
billId: traingBillId,
data: chunks.map((item, index) => ({
...item,
chunkIndex: index
@ -161,10 +188,15 @@ export async function createOneCollection({
datasetId,
type,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
createTime,
updateTime,
hashRawText,
rawTextLength,
metadata = {},
tags,
nextSyncTime,
fileId,
rawLink,
@ -172,15 +204,18 @@ export async function createOneCollection({
externalFileUrl,
apiFileId,
hashRawText,
rawTextLength,
metadata = {},
session,
tags,
// Parse settings
customPdfParse,
imageIndex,
createTime,
updateTime,
nextSyncTime
// Chunk settings
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
autoIndexes,
chunkSize = 512,
chunkSplitter,
qaPrompt,
session
}: CreateOneCollectionParams) {
// Create collection tags
const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@ -196,25 +231,31 @@ export async function createOneCollection({
name,
type,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
rawTextLength,
hashRawText,
tags: collectionTags,
metadata,
createTime,
updateTime,
nextSyncTime,
...(fileId ? { fileId } : {}),
...(rawLink ? { rawLink } : {}),
...(externalFileId ? { externalFileId } : {}),
...(externalFileUrl ? { externalFileUrl } : {}),
...(apiFileId ? { apiFileId } : {}),
rawTextLength,
hashRawText,
tags: collectionTags,
// Parse settings
customPdfParse,
imageIndex,
createTime,
updateTime,
nextSyncTime
// Chunk settings
trainingType,
autoIndexes,
chunkSize,
chunkSplitter,
qaPrompt
}
],
{ session, ordered: true }

View File

@ -1,7 +1,10 @@
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
import {
DatasetCollectionTypeMap,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
@ -31,6 +34,8 @@ const DatasetCollectionSchema = new Schema({
ref: DatasetCollectionName,
required: true
},
// Basic info
type: {
type: String,
enum: Object.keys(DatasetCollectionTypeMap),
@ -40,6 +45,11 @@ const DatasetCollectionSchema = new Schema({
type: String,
required: true
},
tags: {
type: [String],
default: []
},
createTime: {
type: Date,
default: () => new Date()
@ -48,33 +58,8 @@ const DatasetCollectionSchema = new Schema({
type: Date,
default: () => new Date()
},
forbid: {
type: Boolean,
default: false
},
// chunk filed
trainingType: {
type: String,
enum: Object.keys(TrainingTypeMap)
},
chunkSize: {
type: Number,
required: true
},
chunkSplitter: {
type: String
},
qaPrompt: {
type: String
},
ocrParse: Boolean,
tags: {
type: [String],
default: []
},
// Metadata
// local file collection
fileId: {
type: Schema.Types.ObjectId,
@ -82,22 +67,39 @@ const DatasetCollectionSchema = new Schema({
},
// web link collection
rawLink: String,
// api collection
// Api collection
apiFileId: String,
// external collection
// external collection(Abandoned)
externalFileId: String,
externalFileUrl: String, // external import url
// next sync time
nextSyncTime: Date,
// metadata
rawTextLength: Number,
hashRawText: String,
metadata: {
type: Object,
default: {}
}
},
forbid: Boolean,
// next sync time
nextSyncTime: Date,
// Parse settings
customPdfParse: Boolean,
// Chunk settings
imageIndex: Boolean,
autoIndexes: Boolean,
trainingType: {
type: String,
enum: Object.values(DatasetCollectionDataProcessModeEnum)
},
chunkSize: {
type: Number,
required: true
},
chunkSplitter: String,
qaPrompt: String
});
DatasetCollectionSchema.virtual('dataset', {

View File

@ -2,12 +2,17 @@ import { MongoDatasetCollection } from './schema';
import { ClientSession } from '../../../common/mongo';
import { MongoDatasetCollectionTags } from '../tag/schema';
import { readFromSecondary } from '../../../common/mongo/utils';
import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
import {
CollectionWithDatasetType,
DatasetCollectionSchemaType
} from '@fastgpt/global/core/dataset/type';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionSyncResultEnum,
DatasetCollectionTypeEnum,
DatasetSourceReadTypeEnum,
DatasetTypeEnum
DatasetTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import { readDatasetSourceRawText } from '../read';
@ -160,6 +165,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
})();
const rawText = await readDatasetSourceRawText({
teamId: collection.teamId,
tmbId: collection.tmbId,
...sourceReadType
});
@ -220,3 +226,24 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
return DatasetCollectionSyncResultEnum.success;
};
/*
QA: runs as an independent process
Chunk: Image index -> Auto index -> chunk index
*/
export const getTrainingModeByCollection = (collection: {
trainingType: DatasetCollectionSchemaType['trainingType'];
autoIndexes?: DatasetCollectionSchemaType['autoIndexes'];
imageIndex?: DatasetCollectionSchemaType['imageIndex'];
}) => {
if (collection.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return TrainingModeEnum.qa;
}
if (collection.imageIndex && global.feConfigs?.isPlus) {
return TrainingModeEnum.image;
}
if (collection.autoIndexes && global.feConfigs?.isPlus) {
return TrainingModeEnum.auto;
}
return TrainingModeEnum.chunk;
};

View File

@ -7,6 +7,7 @@ import {
} from '@fastgpt/global/support/user/team/constant';
import { DatasetCollectionName } from '../schema';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
export const DatasetDataCollectionName = 'dataset_datas';
@ -42,10 +43,16 @@ const DatasetDataSchema = new Schema({
indexes: {
type: [
{
// Abandon
defaultIndex: {
type: Boolean,
default: false
},
type: {
type: String,
enum: Object.values(DatasetDataIndexTypeEnum),
default: DatasetDataIndexTypeEnum.custom
},
dataId: {
type: String,
required: true

View File

@ -13,11 +13,15 @@ import { POST } from '../../common/api/plusRequest';
export const readFileRawTextByUrl = async ({
teamId,
tmbId,
url,
customPdfParse,
relatedId
}: {
teamId: string;
tmbId: string;
url: string;
customPdfParse?: boolean;
relatedId: string; // externalFileId / apiFileId
}) => {
const response = await axios({
@ -30,8 +34,11 @@ export const readFileRawTextByUrl = async ({
const buffer = Buffer.from(response.data, 'binary');
const { rawText } = await readRawContentByFileBuffer({
customPdfParse,
isQAImport: false,
extension,
teamId,
tmbId,
buffer,
encoding: 'utf-8',
metadata: {
@ -49,6 +56,7 @@ export const readFileRawTextByUrl = async ({
*/
export const readDatasetSourceRawText = async ({
teamId,
tmbId,
type,
sourceId,
isQAImport,
@ -56,11 +64,14 @@ export const readDatasetSourceRawText = async ({
externalFileId,
apiServer,
feishuServer,
yuqueServer
yuqueServer,
customPdfParse
}: {
teamId: string;
tmbId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
customPdfParse?: boolean;
isQAImport?: boolean; // csv data
selector?: string; // link selector
@ -72,9 +83,11 @@ export const readDatasetSourceRawText = async ({
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const { rawText } = await readFileContentFromMongo({
teamId,
tmbId,
bucketName: BucketNameEnum.dataset,
fileId: sourceId,
isQAImport
isQAImport,
customPdfParse
});
return rawText;
} else if (type === DatasetSourceReadTypeEnum.link) {
@ -88,8 +101,10 @@ export const readDatasetSourceRawText = async ({
if (!externalFileId) return Promise.reject('FileId not found');
const rawText = await readFileRawTextByUrl({
teamId,
tmbId,
url: sourceId,
relatedId: externalFileId
relatedId: externalFileId,
customPdfParse
});
return rawText;
} else if (type === DatasetSourceReadTypeEnum.apiFile) {
@ -98,7 +113,8 @@ export const readDatasetSourceRawText = async ({
feishuServer,
yuqueServer,
apiFileId: sourceId,
teamId
teamId,
tmbId
});
return rawText;
}
@ -110,16 +126,18 @@ export const readApiServerFileContent = async ({
feishuServer,
yuqueServer,
apiFileId,
teamId
teamId,
tmbId
}: {
apiServer?: APIFileServer;
feishuServer?: FeishuServer;
yuqueServer?: YuqueServer;
apiFileId: string;
teamId: string;
tmbId: string;
}) => {
if (apiServer) {
return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, apiFileId });
return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, tmbId, apiFileId });
}
if (feishuServer || yuqueServer) {

View File

@ -67,6 +67,7 @@ const DatasetSchema = new Schema({
required: true,
default: 'gpt-4o-mini'
},
vlmModel: String,
intro: {
type: String,
default: ''

View File

@ -1,16 +1,16 @@
import { MongoDatasetTraining } from './schema';
import type {
PushDatasetDataChunkProps,
PushDatasetDataProps,
PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { ClientSession } from '../../../common/mongo';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { getCollectionWithDataset } from '../controller';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try {
@ -28,20 +28,17 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
export const pushDataListToTrainingQueueByCollectionId = async ({
collectionId,
...props
}: {
teamId: string;
tmbId: string;
session?: ClientSession;
} & PushDatasetDataProps) => {
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
const {
dataset: { _id: datasetId, agentModel, vectorModel }
dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
} = await getCollectionWithDataset(collectionId);
return pushDataListToTrainingQueue({
...props,
datasetId,
collectionId,
vectorModel,
agentModel,
vectorModel
vlmModel
});
};
@ -52,30 +49,30 @@ export async function pushDataListToTrainingQueue({
collectionId,
agentModel,
vectorModel,
vlmModel,
data,
prompt,
billId,
trainingMode = TrainingModeEnum.chunk,
mode = TrainingModeEnum.chunk,
session
}: {
teamId: string;
tmbId: string;
datasetId: string;
agentModel: string;
vectorModel: string;
session?: ClientSession;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
if (mode !== TrainingModeEnum.image) return mode;
// Check whether the content contains a markdown image like ![](xxx)
const text = data.q + data.a || '';
const regex = /!\[\]\((.*?)\)/g;
const match = text.match(regex);
if (match) {
return TrainingModeEnum.image;
}
return mode;
};
const { model, maxToken, weight } = await (async () => {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(`File model ${agentModel} is inValid`);
}
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Vector model ${vectorModel} is inValid`);
}
if (trainingMode === TrainingModeEnum.chunk) {
if (mode === TrainingModeEnum.chunk) {
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Vector model ${vectorModel} is inValid`);
}
return {
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model,
@ -83,7 +80,11 @@ export async function pushDataListToTrainingQueue({
};
}
if (trainingMode === TrainingModeEnum.qa || trainingMode === TrainingModeEnum.auto) {
if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(`File model ${agentModel} is inValid`);
}
return {
maxToken: agentModelData.maxContext * 0.8,
model: agentModelData.model,
@ -91,8 +92,24 @@ export async function pushDataListToTrainingQueue({
};
}
return Promise.reject(`Training mode "${trainingMode}" is inValid`);
if (mode === TrainingModeEnum.image) {
const vllmModelData = getVlmModel(vlmModel);
if (!vllmModelData) {
return Promise.reject(`Vlm model ${vlmModel} is inValid`);
}
return {
maxToken: vllmModelData.maxContext * 0.8,
model: vllmModelData.model,
weight: 0
};
}
return Promise.reject(`Training mode "${mode}" is inValid`);
})();
// Filter redundant params
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
prompt = undefined;
}
// filter repeat or equal content
const set = new Set();
@ -158,7 +175,7 @@ export async function pushDataListToTrainingQueue({
datasetId,
collectionId,
billId,
mode: trainingMode,
mode: getImageChunkMode(item, mode),
prompt,
model,
q: item.q,

View File

@ -1,14 +1,15 @@
/* Model knowledge base (dataset) */
import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema } = connectionMongo;
import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import { TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
export const DatasetTrainingCollectionName = 'dataset_trainings';
@ -25,7 +26,6 @@ const TrainingDataSchema = new Schema({
},
datasetId: {
type: Schema.Types.ObjectId,
ref: DatasetCollectionName,
required: true
},
collectionId: {
@ -33,15 +33,13 @@ const TrainingDataSchema = new Schema({
ref: DatasetColCollectionName,
required: true
},
billId: {
// concat bill
type: String
},
billId: String,
mode: {
type: String,
enum: Object.keys(TrainingTypeMap),
enum: Object.values(TrainingModeEnum),
required: true
},
expireAt: {
// It will be deleted after 7 days
type: Date,
@ -88,6 +86,10 @@ const TrainingDataSchema = new Schema({
indexes: {
type: [
{
type: {
type: String,
enum: Object.values(DatasetDataIndexTypeEnum)
},
text: {
type: String,
required: true
@ -98,6 +100,19 @@ const TrainingDataSchema = new Schema({
}
});
TrainingDataSchema.virtual('dataset', {
ref: DatasetCollectionName,
localField: 'datasetId',
foreignField: '_id',
justOne: true
});
TrainingDataSchema.virtual('collection', {
ref: DatasetColCollectionName,
localField: 'collectionId',
foreignField: '_id',
justOne: true
});
try {
// lock training data(teamId); delete training data
TrainingDataSchema.index({ teamId: 1, datasetId: 1 });

View File

@ -1,6 +1,7 @@
import { NodeOutputKeyEnum } from '@fastgpt/global/core/workflow/constants';
import { DispatchNodeResponseKeyEnum } from '@fastgpt/global/core/workflow/runtime/constants';
import type {
ChatDispatchProps,
DispatchNodeResultType,
RuntimeNodeItemType
} from '@fastgpt/global/core/workflow/runtime/type';
@ -46,7 +47,7 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
query,
requestOrigin,
chatConfig,
runningAppInfo: { teamId },
runningUserInfo,
externalProvider,
params: {
model,
@ -99,10 +100,10 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
const globalFiles = chatValue2RuntimePrompt(query).files;
const { documentQuoteText, userFiles } = await getMultiInput({
runningUserInfo,
histories: chatHistories,
requestOrigin,
maxFiles: chatConfig?.fileSelectConfig?.maxFiles || 20,
teamId,
fileLinks,
inputFiles: globalFiles,
hasReadFilesTool
@ -289,19 +290,19 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
};
const getMultiInput = async ({
runningUserInfo,
histories,
fileLinks,
requestOrigin,
maxFiles,
teamId,
inputFiles,
hasReadFilesTool
}: {
runningUserInfo: ChatDispatchProps['runningUserInfo'];
histories: ChatItemType[];
fileLinks?: string[];
requestOrigin?: string;
maxFiles: number;
teamId: string;
inputFiles: UserChatItemValueItemType['file'][];
hasReadFilesTool: boolean;
}) => {
@ -329,7 +330,8 @@ const getMultiInput = async ({
urls,
requestOrigin,
maxFiles,
teamId
teamId: runningUserInfo.teamId,
tmbId: runningUserInfo.tmbId
});
return {

View File

@ -11,7 +11,10 @@ import { formatModelChars2Points } from '../../../../support/wallet/usage/utils'
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
import { postTextCensor } from '../../../../common/api/requestPlusApi';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import type { DispatchNodeResultType } from '@fastgpt/global/core/workflow/runtime/type';
import type {
ChatDispatchProps,
DispatchNodeResultType
} from '@fastgpt/global/core/workflow/runtime/type';
import { countGptMessagesTokens } from '../../../../common/string/tiktoken/index';
import {
chats2GPTMessages,
@ -69,7 +72,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
histories,
node: { name },
query,
runningAppInfo: { teamId },
runningUserInfo,
workflowStreamResponse,
chatConfig,
params: {
@ -121,7 +124,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
stringQuoteText,
requestOrigin,
maxFiles: chatConfig?.fileSelectConfig?.maxFiles || 20,
teamId
runningUserInfo
})
]);
@ -355,7 +358,7 @@ async function getMultiInput({
stringQuoteText,
requestOrigin,
maxFiles,
teamId
runningUserInfo
}: {
histories: ChatItemType[];
inputFiles: UserChatItemValueItemType['file'][];
@ -363,7 +366,7 @@ async function getMultiInput({
stringQuoteText?: string; // file quote
requestOrigin?: string;
maxFiles: number;
teamId: string;
runningUserInfo: ChatDispatchProps['runningUserInfo'];
}) {
// Legacy version adaptation ====>
if (stringQuoteText) {
@ -400,7 +403,8 @@ async function getMultiInput({
urls,
requestOrigin,
maxFiles,
teamId
teamId: runningUserInfo.teamId,
tmbId: runningUserInfo.tmbId
});
return {

View File

@ -45,7 +45,7 @@ ${content.slice(0, 100)}${content.length > 100 ? '......' : ''}
export const dispatchReadFiles = async (props: Props): Promise<Response> => {
const {
requestOrigin,
runningAppInfo: { teamId },
runningUserInfo: { teamId, tmbId },
histories,
chatConfig,
node: { version },
@ -61,7 +61,8 @@ export const dispatchReadFiles = async (props: Props): Promise<Response> => {
urls: [...fileUrlList, ...filesFromHistories],
requestOrigin,
maxFiles,
teamId
teamId,
tmbId
});
return {
@ -105,12 +106,14 @@ export const getFileContentFromLinks = async ({
urls,
requestOrigin,
maxFiles,
teamId
teamId,
tmbId
}: {
urls: string[];
requestOrigin?: string;
maxFiles: number;
teamId: string;
tmbId: string;
}) => {
const parseUrlList = urls
// Remove invalid urls
@ -205,6 +208,7 @@ export const getFileContentFromLinks = async ({
extension,
isQAImport: false,
teamId,
tmbId,
buffer,
encoding
});

View File

@ -117,14 +117,16 @@ export const createTrainingUsage = async ({
billSource,
vectorModel,
agentModel,
vllmModel,
session
}: {
teamId: string;
tmbId: string;
appName: string;
billSource: UsageSourceEnum;
vectorModel: string;
agentModel: string;
vectorModel?: string;
agentModel?: string;
vllmModel?: string;
session?: ClientSession;
}) => {
const [{ _id }] = await MongoUsage.create(
@ -136,27 +138,46 @@ export const createTrainingUsage = async ({
source: billSource,
totalPoints: 0,
list: [
{
moduleName: i18nT('common:support.wallet.moduleName.index'),
model: vectorModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
},
{
moduleName: i18nT('common:support.wallet.moduleName.qa'),
model: agentModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
},
{
moduleName: i18nT('common:core.dataset.training.Auto mode'),
model: agentModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
}
...(vectorModel
? [
{
moduleName: i18nT('account_usage:embedding_index'),
model: vectorModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
}
]
: []),
...(agentModel
? [
{
moduleName: i18nT('account_usage:qa'),
model: agentModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
},
{
moduleName: i18nT('account_usage:auto_index'),
model: agentModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
}
]
: []),
...(vllmModel
? [
{
moduleName: i18nT('account_usage:image_parse'),
model: vllmModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
}
]
: [])
]
}
],
@ -165,3 +186,31 @@ export const createTrainingUsage = async ({
return { billId: String(_id) };
};
export const createPdfParseUsage = async ({
teamId,
tmbId,
pages
}: {
teamId: string;
tmbId: string;
pages: number;
}) => {
const unitPrice = global.systemEnv?.customPdfParse?.price || 0;
const totalPoints = pages * unitPrice;
createUsage({
teamId,
tmbId,
appName: i18nT('account_usage:pdf_enhanced_parse'),
totalPoints,
source: UsageSourceEnum.pdfParse,
list: [
{
moduleName: i18nT('account_usage:pdf_enhanced_parse'),
amount: totalPoints,
pages
}
]
});
};
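
For orientation, a minimal sketch of a call site for the new helper (the variable names and the import path are assumptions, not part of this commit). The helper charges `pages * systemEnv.customPdfParse.price` points under the `pdfParse` usage source:

```ts
import { createPdfParseUsage } from '@fastgpt/service/support/wallet/usage/controller'; // path assumed

// Hypothetical hook invoked once an enhanced PDF parse finishes.
const onPdfParsed = (teamId: string, tmbId: string, totalPages: number) => {
  // Bills totalPages * customPdfParse.price points to the team.
  createPdfParseUsage({ teamId, tmbId, pages: totalPages });
};
```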

View File

@ -9,7 +9,7 @@ import { readXlsxRawText } from './extension/xlsx';
import { readCsvRawText } from './extension/csv';
parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
const readRawContentByFileBuffer = async (params: ReadRawTextByBuffer) => {
const read = async (params: ReadRawTextByBuffer) => {
switch (params.extension) {
case 'txt':
case 'md':
@ -41,7 +41,7 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
try {
parentPort?.postMessage({
type: 'success',
data: await readRawContentByFileBuffer(newProps)
data: await read(newProps)
});
} catch (error) {
console.log(error);

View File

@ -17,7 +17,7 @@ const MyPhotoView = (props: ImageProps) => {
loadingElement={<Loading fixed={false} />}
>
<PhotoView src={props.src}>
<MyImage cursor={'pointer'} {...props} />
<MyImage cursor={'pointer'} {...props} title={props.title || props.src} />
</PhotoView>
</PhotoProvider>
);

View File

@ -11,8 +11,8 @@ type Props = BoxProps & {
const MyBox = ({ text, isLoading, children, size, ...props }: Props, ref: any) => {
return (
<Box ref={ref} position={isLoading ? 'relative' : 'unset'} {...props}>
{isLoading && <Loading fixed={false} text={text} size={size} />}
{children}
{isLoading && <Loading fixed={false} text={text} size={size} />}
</Box>
);
};

View File

@ -1,26 +1,24 @@
import React from 'react';
import { Box, Flex, useTheme, Grid, type GridProps, HStack } from '@chakra-ui/react';
import { useTranslation } from 'next-i18next';
import MyTooltip from '../MyTooltip';
import QuestionTip from '../MyTooltip/QuestionTip';
// @ts-ignore
interface Props extends GridProps {
type Props<T> = Omit<GridProps, 'onChange'> & {
list: {
title: string;
desc?: string;
value: any;
value: T;
children?: React.ReactNode;
tooltip?: string;
}[];
align?: 'flex-top' | 'center';
value: any;
value: T;
defaultBg?: string;
activeBg?: string;
onChange: (e: any) => void;
}
onChange: (e: T) => void;
};
const LeftRadio = ({
const LeftRadio = <T = any,>({
list,
value,
align = 'flex-top',
@ -30,7 +28,7 @@ const LeftRadio = ({
activeBg = 'primary.50',
onChange,
...props
}: Props) => {
}: Props<T>) => {
const { t } = useTranslation();
const theme = useTheme();
@ -39,7 +37,7 @@ const LeftRadio = ({
{list.map((item) => (
<Flex
alignItems={item.desc ? align : 'center'}
key={item.value}
key={item.value as any}
cursor={'pointer'}
userSelect={'none'}
px={px}
@ -98,7 +96,7 @@ const LeftRadio = ({
fontSize={'sm'}
>
<Box>{typeof item.title === 'string' ? t(item.title as any) : item.title}</Box>
{!!item.tooltip && <QuestionTip label={item.tooltip} ml={1} color={'myGray.600'} />}
{!!item.tooltip && <QuestionTip label={item.tooltip} color={'myGray.600'} />}
</HStack>
</Flex>
{!!item.desc && (
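
As a usage sketch (component and values assumed, not from this commit), the new generic parameter ties the item values to `onChange`, so the handler receives a typed value instead of `any`:

```tsx
import React, { useState } from 'react';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';

type Mode = 'chunk' | 'qa';

// Hypothetical picker: T is declared as Mode, so onChange gets a Mode.
const ModePicker = () => {
  const [mode, setMode] = useState<Mode>('chunk');
  return (
    <LeftRadio<Mode>
      list={[
        { title: 'Chunk', value: 'chunk' },
        { title: 'QA', value: 'qa' }
      ]}
      value={mode}
      onChange={setMode}
    />
  );
};

export default ModePicker;
```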

View File

@ -2,6 +2,7 @@
"ai_model": "AI model",
"all": "all",
"app_name": "Application name",
"auto_index": "Auto index",
"billing_module": "Deduction module",
"confirm_export": "A total of {{total}} pieces of data were filtered out. Are you sure to export?",
"current_filter_conditions": "Current filter conditions",
@ -9,6 +10,7 @@
"details": "Details",
"dingtalk": "DingTalk",
"duration_seconds": "Duration (seconds)",
"embedding_index": "Embedding",
"every_day": "Day",
"every_month": "Moon",
"export_confirm": "Export confirmation",
@ -16,6 +18,7 @@
"export_title": "Time,Members,Type,Project name,AI points",
"feishu": "Feishu",
"generation_time": "Generation time",
"image_parse": "Image tagging",
"input_token_length": "input tokens",
"member": "member",
"member_name": "Member name",
@ -25,8 +28,12 @@
"official_account": "Official Account",
"order_number": "Order number",
"output_token_length": "output tokens",
"pages": "Pages",
"pdf_enhanced_parse": "PDF Enhanced Analysis",
"pdf_parse": "PDF Analysis",
"points": "Points",
"project_name": "Project name",
"qa": "QA",
"select_member_and_source_first": "Please select members and types first",
"share": "Share Link",
"source": "source",

View File

@ -562,10 +562,7 @@
"core.dataset.file": "File",
"core.dataset.folder": "Directory",
"core.dataset.import.Auto mode Estimated Price Tips": "Requires calling the file processing model, which consumes a lot of tokens: {{price}} points/1K tokens",
"core.dataset.import.Auto process": "Automatic",
"core.dataset.import.Auto process desc": "Automatically set segmentation and preprocessing rules",
"core.dataset.import.Chunk Range": "Range: {{min}}~{{max}}",
"core.dataset.import.Chunk Split": "Chunks",
"core.dataset.import.Chunk Split Tip": "Segment the text according to certain rules and convert it into a format that can be semantically searched. Suitable for most scenarios. No additional model processing is required, and the cost is low.",
"core.dataset.import.Continue upload": "Continue upload",
"core.dataset.import.Custom process": "Custom Rules",
@ -575,7 +572,6 @@
"core.dataset.import.Custom split char Tips": "Allows you to segment based on custom separators. Usually used for pre-processed data, using specific separators for precise segmentation.",
"core.dataset.import.Custom text": "Custom Text",
"core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset",
"core.dataset.import.Data Preprocessing": "Data Processing",
"core.dataset.import.Data process params": "Data Processing Parameters",
"core.dataset.import.Down load csv template": "Click to Download CSV Template",
"core.dataset.import.Embedding Estimated Price Tips": "Only use the index model, consuming a small amount of AI points: {{price}} points/1K tokens",
@ -597,7 +593,6 @@
"core.dataset.import.Source name": "Source Name",
"core.dataset.import.Sources list": "Sources",
"core.dataset.import.Start upload": "Start Upload",
"core.dataset.import.Total files": "Total {{total}} Files",
"core.dataset.import.Upload complete": "Upload complete",
"core.dataset.import.Upload data": "Confirm Upload",
"core.dataset.import.Upload file progress": "File Upload Progress",
@ -649,10 +644,10 @@
"core.dataset.training.Agent queue": "QA Training Queue",
"core.dataset.training.Auto mode": "Auto index",
"core.dataset.training.Auto mode Tip": "Increase the semantic richness of data blocks by generating related questions and summaries through sub-indexes and calling models, making it more conducive to retrieval. Requires more storage space and increases AI call times.",
"core.dataset.training.Chunk mode": "Default",
"core.dataset.training.Chunk mode": "Chunk",
"core.dataset.training.Full": "Estimated Over 5 Minutes",
"core.dataset.training.Leisure": "Idle",
"core.dataset.training.QA mode": "QA Chunks",
"core.dataset.training.QA mode": "QA",
"core.dataset.training.Vector queue": "Index Queue",
"core.dataset.training.Waiting": "Estimated 5 Minutes",
"core.dataset.training.Website Sync": "Website Sync",
@ -861,7 +856,6 @@
"dataset.collections.Select Collection": "Select File",
"dataset.collections.Select One Collection To Store": "Select a File to Store",
"dataset.data.Can not edit": "No Edit Permission",
"dataset.data.Custom Index Number": "Custom Index {{number}}",
"dataset.data.Default Index": "Default Index",
"dataset.data.Delete Tip": "Confirm to Delete This Data?",
"dataset.data.Index Placeholder": "Enter Index Text Content",
@ -956,6 +950,7 @@
"new_create": "Create New",
"no": "No",
"no_laf_env": "System Not Configured with Laf Environment",
"not_model_config": "No related model configured",
"not_yet_introduced": "No Introduction Yet",
"option": "Option",
"pay.amount": "Amount",
@ -1121,7 +1116,6 @@
"support.wallet.invoice_detail": "Invoice Details",
"support.wallet.invoice_info": "The invoice will be sent to the email within 3-7 working days, please wait patiently",
"support.wallet.invoicing": "Invoicing",
"support.wallet.moduleName.index": "Index Generation",
"support.wallet.moduleName.qa": "QA Split",
"support.wallet.noBill": "No Bill Records",
"support.wallet.no_invoice": "No Invoice Records",

View File

@ -3,11 +3,16 @@
"add_file": "Import",
"api_file": "API Dataset",
"api_url": "API Url",
"auto_indexes": "Automatically generate supplementary indexes",
"auto_indexes_tips": "Additional index generation is performed through large models to improve semantic richness and improve retrieval accuracy.",
"chunk_max_tokens": "max_tokens",
"close_auto_sync": "Are you sure you want to turn off automatic sync?",
"collection.Create update time": "Creation/Update Time",
"collection.Training type": "Training",
"collection.training_type": "Chunk type",
"collection_data_count": "Data amount",
"collection_metadata_custom_pdf_parse": "PDF enhancement analysis",
"collection_metadata_image_parse": "Image tagging",
"collection_not_support_retraining": "This collection type does not support retuning parameters",
"collection_not_support_sync": "This collection does not support synchronization",
"collection_sync": "Sync data",
@ -22,12 +27,21 @@
"custom_data_process_params_desc": "Customize data processing rules",
"data.ideal_chunk_length": "ideal block length",
"data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
"data_index_custom": "Custom index",
"data_index_default": "Default index",
"data_index_image": "Image Index",
"data_index_num": "Index {{index}}",
"data_index_question": "Inferred question index",
"data_index_summary": "Summary index",
"data_process_params": "Params",
"data_process_setting": "Processing config",
"dataset.Unsupported operation": "dataset.Unsupported operation",
"dataset.no_collections": "No datasets available",
"dataset.no_tags": "No tags available",
"default_params": "default",
"default_params_desc": "Use system default parameters and rules",
"edit_dataset_config": "Edit knowledge base configuration",
"enhanced_indexes": "Index enhancement",
"error.collectionNotFound": "Collection not found~",
"external_file": "External File Library",
"external_file_dataset_desc": "Import files from an external file library to build a Dataset. The files will not be stored again.",
@ -38,19 +52,38 @@
"feishu_dataset": "Feishu Dataset",
"feishu_dataset_config": "Feishu Dataset Config",
"feishu_dataset_desc": "Can build a dataset using Feishu documents by configuring permissions, without secondary storage",
"file_list": "File list",
"file_model_function_tip": "Enhances indexing and QA generation",
"filename": "Filename",
"folder_dataset": "Folder",
"ideal_chunk_length": "ideal block length",
"ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.",
"image_auto_parse": "Automatic image indexing",
"image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes",
"import.Auto mode Estimated Price Tips": "The text understanding model needs to be called, which requires more points: {{price}} points/1K tokens",
"import.Embedding Estimated Price Tips": "Only use the index model and consume a small amount of AI points: {{price}} points/1K tokens",
"import_confirm": "Confirm upload",
"import_data_preview": "Data preview",
"import_data_process_setting": "Data processing method settings",
"import_file_parse_setting": "File parsing settings",
"import_model_config": "Model selection",
"import_param_setting": "Parameter settings",
"import_select_file": "Select a file",
"is_open_schedule": "Enable scheduled synchronization",
"keep_image": "Keep the picture",
"move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
"open_auto_sync": "After scheduled synchronization is turned on, the system will try to synchronize the collection from time to time every day. During the collection synchronization period, the collection data will not be searched.",
"params_setting": "Parameter settings",
"pdf_enhance_parse": "PDF enhancement analysis",
"pdf_enhance_parse_price": "{{price}} points/page",
"pdf_enhance_parse_tips": "When parsing a PDF file, the PDF recognition model is called for recognition, which can be converted into Markdown and retained the pictures in the document, and can also identify the scanned files.",
"permission.des.manage": "Can manage the entire knowledge base data and information",
"permission.des.read": "View knowledge base content",
"permission.des.write": "Ability to add and change knowledge base content",
"preview_chunk": "Preview chunks",
"preview_chunk_empty": "Unable to read the contents of the file",
"preview_chunk_intro": "Display up to 10 pieces",
"preview_chunk_not_selected": "Click on the file on the left to preview",
"rebuild_embedding_start_tip": "Index model switching task has started",
"rebuilding_index_count": "Number of indexes being rebuilt: {{count}}",
"request_headers": "Request headers, will automatically append 'Bearer '",
@ -72,8 +105,10 @@
"tag.tags": "Tags",
"tag.total_tags": "Total {{total}} tags",
"the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "The Dataset has indexes that are being trained or rebuilt",
"total_num_files": "Total {{total}} files",
"training_mode": "Chunk mode",
"vector_model_max_tokens_tip": "Each chunk of data has a maximum length of 3000 tokens",
"vllm_model": "Image understanding model",
"website_dataset": "Website Sync",
"website_dataset_desc": "Website sync allows you to build a Dataset directly using a web link.",
"yuque_dataset": "Yuque Dataset",

View File

@ -2,6 +2,7 @@
"ai_model": "AI 模型",
"all": "所有",
"app_name": "应用名",
"auto_index": "索引增强",
"billing_module": "扣费模块",
"confirm_export": "共筛选出 {{total}} 条数据,是否确认导出?",
"current_filter_conditions": "当前筛选条件:",
@ -9,6 +10,7 @@
"details": "详情",
"dingtalk": "钉钉",
"duration_seconds": "时长(秒)",
"embedding_index": "索引生成",
"every_day": "天",
"every_month": "月",
"every_week": "每周",
@ -18,6 +20,7 @@
"export_title": "时间,成员,类型,项目名,AI 积分消耗",
"feishu": "飞书",
"generation_time": "生成时间",
"image_parse": "图片标注",
"input_token_length": "输入 tokens",
"member": "成员",
"member_name": "成员名",
@ -27,8 +30,12 @@
"official_account": "公众号",
"order_number": "订单号",
"output_token_length": "输出 tokens",
"pages": "页数",
"pdf_enhanced_parse": "PDF 增强解析",
"pdf_parse": "PDF 解析",
"points": "积分",
"project_name": "项目名",
"qa": "问答对提取",
"select_member_and_source_first": "请先选中成员和类型",
"share": "分享链接",
"source": "来源",

View File

@ -565,10 +565,7 @@
"core.dataset.file": "文件",
"core.dataset.folder": "目录",
"core.dataset.import.Auto mode Estimated Price Tips": "需调用文本理解模型需要消耗较多AI 积分:{{price}} 积分/1K tokens",
"core.dataset.import.Auto process": "自动",
"core.dataset.import.Auto process desc": "自动设置分割和预处理规则",
"core.dataset.import.Chunk Range": "范围:{{min}}~{{max}}",
"core.dataset.import.Chunk Split": "直接分段",
"core.dataset.import.Chunk Split Tip": "将文本按一定的规则进行分段处理后,转成可进行语义搜索的格式,适合绝大多数场景。不需要调用模型额外处理,成本低。",
"core.dataset.import.Continue upload": "继续上传",
"core.dataset.import.Custom process": "自定义规则",
@ -578,7 +575,6 @@
"core.dataset.import.Custom split char Tips": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。",
"core.dataset.import.Custom text": "自定义文本",
"core.dataset.import.Custom text desc": "手动输入一段文本作为数据集",
"core.dataset.import.Data Preprocessing": "数据处理",
"core.dataset.import.Data process params": "数据处理参数",
"core.dataset.import.Down load csv template": "点击下载 CSV 模板",
"core.dataset.import.Embedding Estimated Price Tips": "仅使用索引模型,消耗少量 AI 积分:{{price}} 积分/1K tokens",
@ -600,7 +596,6 @@
"core.dataset.import.Source name": "来源名",
"core.dataset.import.Sources list": "来源列表",
"core.dataset.import.Start upload": "开始上传",
"core.dataset.import.Total files": "共 {{total}} 个文件",
"core.dataset.import.Upload complete": "完成上传",
"core.dataset.import.Upload data": "确认上传",
"core.dataset.import.Upload file progress": "文件上传进度",
@ -650,12 +645,12 @@
"core.dataset.test.test result placeholder": "测试结果将在这里展示",
"core.dataset.test.test result tip": "根据知识库内容与测试文本的相似度进行排序,你可以根据测试结果调整对应的文本。\n注意测试记录中的数据可能已经被修改过点击某条测试数据后将展示最新的数据。",
"core.dataset.training.Agent queue": "QA 训练排队",
"core.dataset.training.Auto mode": "增强处理",
"core.dataset.training.Auto mode": "补充索引",
"core.dataset.training.Auto mode Tip": "通过子索引以及调用模型生成相关问题与摘要,来增加数据块的语义丰富度,更利于检索。需要消耗更多的存储空间和增加 AI 调用次数。",
"core.dataset.training.Chunk mode": "直接分",
"core.dataset.training.Chunk mode": "直接分",
"core.dataset.training.Full": "预计 5 分钟以上",
"core.dataset.training.Leisure": "空闲",
"core.dataset.training.QA mode": "问答拆分",
"core.dataset.training.QA mode": "问答对提取",
"core.dataset.training.Vector queue": "索引排队",
"core.dataset.training.Waiting": "预计 5 分钟",
"core.dataset.training.Website Sync": "Web 站点同步",
@ -864,7 +859,6 @@
"dataset.collections.Select Collection": "选择文件",
"dataset.collections.Select One Collection To Store": "选择一个文件进行存储",
"dataset.data.Can not edit": "无编辑权限",
"dataset.data.Custom Index Number": "自定义索引{{number}}",
"dataset.data.Default Index": "默认索引",
"dataset.data.Delete Tip": "确认删除该条数据?",
"dataset.data.Index Placeholder": "输入索引文本内容",
@ -959,6 +953,7 @@
"new_create": "新建",
"no": "否",
"no_laf_env": "系统未配置Laf环境",
"not_model_config": "未配置相关模型",
"not_yet_introduced": "暂无介绍",
"option": "选项",
"pay.amount": "金额",
@ -1124,7 +1119,6 @@
"support.wallet.invoice_detail": "发票详情",
"support.wallet.invoice_info": "发票将在 3-7 个工作日内发送至邮箱,请耐心等待",
"support.wallet.invoicing": "开票",
"support.wallet.moduleName.index": "索引生成",
"support.wallet.moduleName.qa": "QA 拆分",
"support.wallet.noBill": "无账单记录~",
"support.wallet.no_invoice": "暂无开票记录",

View File

@ -3,11 +3,16 @@
"add_file": "添加文件",
"api_file": "API 文件库",
"api_url": "接口地址",
"auto_indexes": "自动生成补充索引",
"auto_indexes_tips": "通过大模型进行额外索引生成,提高语义丰富度,提高检索的精度。",
"chunk_max_tokens": "分块上限",
"close_auto_sync": "确认关闭自动同步功能?",
"collection.Create update time": "创建/更新时间",
"collection.Training type": "训练模式",
"collection.training_type": "处理模式",
"collection_data_count": "数据量",
"collection_metadata_custom_pdf_parse": "PDF增强解析",
"collection_metadata_image_parse": "图片标注",
"collection_not_support_retraining": "该集合类型不支持重新调整参数",
"collection_not_support_sync": "该集合不支持同步",
"collection_sync": "立即同步",
@ -22,12 +27,21 @@
"custom_data_process_params_desc": "自定义设置数据处理规则",
"data.ideal_chunk_length": "理想分块长度",
"data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
"data_index_custom": "自定义索引",
"data_index_default": "默认索引",
"data_index_image": "图片索引",
"data_index_num": "索引 {{index}}",
"data_index_question": "推测问题索引",
"data_index_summary": "摘要索引",
"data_process_params": "处理参数",
"data_process_setting": "数据处理配置",
"dataset.Unsupported operation": "操作不支持",
"dataset.no_collections": "暂无数据集",
"dataset.no_tags": "暂无标签",
"default_params": "默认",
"default_params_desc": "使用系统默认的参数和规则",
"edit_dataset_config": "编辑知识库配置",
"enhanced_indexes": "索引增强",
"error.collectionNotFound": "集合找不到了~",
"external_file": "外部文件库",
"external_file_dataset_desc": "可以从外部文件库导入文件构建知识库,文件不会进行二次存储",
@ -38,19 +52,38 @@
"feishu_dataset": "飞书知识库",
"feishu_dataset_config": "配置飞书知识库",
"feishu_dataset_desc": "可通过配置飞书文档权限,使用飞书文档构建知识库,文档不会进行二次存储",
"file_list": "文件列表",
"file_model_function_tip": "用于增强索引和 QA 生成",
"filename": "文件名",
"folder_dataset": "文件夹",
"ideal_chunk_length": "理想分块长度",
"ideal_chunk_length_tips": "按结束符号进行分段,并将多个分段组成一个分块,该值决定了分块的预估大小,如果会有上下浮动。",
"image_auto_parse": "图片自动索引",
"image_auto_parse_tips": "调用 VLM 自动标注文档里的图片,并生成额外的检索索引",
"import.Auto mode Estimated Price Tips": "需调用文本理解模型需要消耗较多AI 积分:{{price}} 积分/1K tokens",
"import.Embedding Estimated Price Tips": "仅使用索引模型,消耗少量 AI 积分:{{price}} 积分/1K tokens",
"import_confirm": "确认上传",
"import_data_preview": "数据预览",
"import_data_process_setting": "数据处理方式设置",
"import_file_parse_setting": "文件解析设置",
"import_model_config": "模型选择",
"import_param_setting": "参数设置",
"import_select_file": "选择文件",
"is_open_schedule": "启用定时同步",
"keep_image": "保留图片",
"move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",
"open_auto_sync": "开启定时同步后,系统将会每天不定时尝试同步集合,集合同步期间,会出现无法搜索到该集合数据现象。",
"params_setting": "参数设置",
"pdf_enhance_parse": "PDF增强解析",
"pdf_enhance_parse_price": "{{price}}积分/页",
"pdf_enhance_parse_tips": "解析 PDF 文件时,调用 PDF 识别模型进行识别,可以将其转换成 Markdown 并保留文档中的图片,同时也可以对扫描件进行识别。",
"permission.des.manage": "可管理整个知识库数据和信息",
"permission.des.read": "可查看知识库内容",
"permission.des.write": "可增加和变更知识库内容",
"preview_chunk": "分块预览",
"preview_chunk_empty": "无法读取该文件内容",
"preview_chunk_intro": "最多展示 10 个分块",
"preview_chunk_not_selected": "点击左侧文件后进行预览",
"rebuild_embedding_start_tip": "切换索引模型任务已开始",
"rebuilding_index_count": "重建中索引数量:{{count}}",
"request_headers": "请求头参数,会自动补充 Bearer",
@ -72,8 +105,10 @@
"tag.tags": "标签",
"tag.total_tags": "共{{total}}个标签",
"the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "知识库有训练中或正在重建的索引",
"total_num_files": "共 {{total}} 个文件",
"training_mode": "处理方式",
"vector_model_max_tokens_tip": "每个分块数据,最大长度为 3000 tokens",
"vllm_model": "图片理解模型",
"website_dataset": "Web 站点同步",
"website_dataset_desc": "Web 站点同步允许你直接使用一个网页链接构建知识库",
"yuque_dataset": "语雀知识库",

View File

@ -2,6 +2,7 @@
"ai_model": "AI 模型",
"all": "所有",
"app_name": "應用程式名",
"auto_index": "索引增強",
"billing_module": "扣費模組",
"confirm_export": "共篩選出 {{total}} 條數據,是否確認導出?",
"current_filter_conditions": "當前篩選條件:",
@ -9,6 +10,7 @@
"details": "詳情",
"dingtalk": "釘釘",
"duration_seconds": "時長(秒)",
"embedding_index": "索引生成",
"every_day": "天",
"every_month": "月",
"export_confirm": "導出確認",
@ -16,6 +18,7 @@
"export_title": "時間,成員,類型,項目名,AI 積分消耗",
"feishu": "飛書",
"generation_time": "生成時間",
"image_parse": "圖片標註",
"input_token_length": "輸入 tokens",
"member": "成員",
"member_name": "成員名",
@ -25,8 +28,12 @@
"official_account": "公眾號",
"order_number": "訂單編號",
"output_token_length": "輸出 tokens",
"pages": "頁數",
"pdf_enhanced_parse": "PDF 增強解析",
"pdf_parse": "PDF 解析",
"points": "積分",
"project_name": "專案名",
"qa": "問答對提取",
"select_member_and_source_first": "請先選取成員和類型",
"share": "分享連結",
"source": "來源",

View File

@ -561,10 +561,7 @@
"core.dataset.file": "檔案",
"core.dataset.folder": "目錄",
"core.dataset.import.Auto mode Estimated Price Tips": "需要呼叫檔案處理模型,將消耗較多 AI 點數:{{price}} 點數/1K tokens",
"core.dataset.import.Auto process": "自動",
"core.dataset.import.Auto process desc": "自動設定分割和預處理規則",
"core.dataset.import.Chunk Range": "範圍:{{min}}~{{max}}",
"core.dataset.import.Chunk Split": "直接分段",
"core.dataset.import.Chunk Split Tip": "將文字依照特定規則進行分段處理後,轉換成可進行語意搜尋的格式,適合大多數場景。不需要呼叫模型額外處理,成本較低。",
"core.dataset.import.Continue upload": "繼續上傳",
"core.dataset.import.Custom process": "自訂規則",
@ -574,7 +571,6 @@
"core.dataset.import.Custom split char Tips": "允許您根據自訂的分隔符進行分割。通常用於已處理好的資料,使用特定的分隔符來精確分割。",
"core.dataset.import.Custom text": "自訂文字",
"core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集",
"core.dataset.import.Data Preprocessing": "資料處理",
"core.dataset.import.Data process params": "資料處理參數",
"core.dataset.import.Down load csv template": "點選下載 CSV 範本",
"core.dataset.import.Embedding Estimated Price Tips": "僅使用索引模型,消耗少量 AI 點數:{{price}} 點數/1K tokens",
@ -596,7 +592,6 @@
"core.dataset.import.Source name": "來源名稱",
"core.dataset.import.Sources list": "來源列表",
"core.dataset.import.Start upload": "開始上傳",
"core.dataset.import.Total files": "共 {{total}} 個檔案",
"core.dataset.import.Upload complete": "上傳完成",
"core.dataset.import.Upload data": "確認上傳",
"core.dataset.import.Upload file progress": "檔案上傳進度",
@ -646,12 +641,12 @@
"core.dataset.test.test result placeholder": "測試結果將顯示在這裡",
"core.dataset.test.test result tip": "根據知識庫內容與測試文字的相似度進行排序。您可以根據測試結果調整相應的文字。\n注意測試記錄中的資料可能已經被修改。點選某筆測試資料後將顯示最新資料。",
"core.dataset.training.Agent queue": "問答訓練排隊中",
"core.dataset.training.Auto mode": "增強處理",
"core.dataset.training.Auto mode": "補充索引",
"core.dataset.training.Auto mode Tip": "透過子索引以及呼叫模型產生相關問題與摘要,來增加資料區塊的語意豐富度,更有利於檢索。需要消耗更多的儲存空間並增加 AI 呼叫次數。",
"core.dataset.training.Chunk mode": "直接分",
"core.dataset.training.Chunk mode": "直接分",
"core.dataset.training.Full": "預計超過 5 分鐘",
"core.dataset.training.Leisure": "閒置",
"core.dataset.training.QA mode": "問答拆分",
"core.dataset.training.QA mode": "問答對提取",
"core.dataset.training.Vector queue": "索引排隊中",
"core.dataset.training.Waiting": "預計 5 分鐘",
"core.dataset.training.Website Sync": "網站同步",
@ -861,7 +856,6 @@
"dataset.collections.Select Collection": "選擇檔案",
"dataset.collections.Select One Collection To Store": "選擇一個檔案進行儲存",
"dataset.data.Can not edit": "無編輯權限",
"dataset.data.Custom Index Number": "自訂索引 {{number}}",
"dataset.data.Default Index": "預設索引",
"dataset.data.Delete Tip": "確認刪除此資料?",
"dataset.data.Index Placeholder": "輸入索引文字內容",
@ -955,6 +949,7 @@
"new_create": "建立新項目",
"no": "否",
"no_laf_env": "系統未設定 LAF 環境",
"not_model_config": "未配置相關模型",
"not_yet_introduced": "暫無介紹",
"option": "選項",
"pay.amount": "金額",
@ -1120,7 +1115,6 @@
"support.wallet.invoice_detail": "發票詳細資訊",
"support.wallet.invoice_info": "發票將在 3-7 個工作天內寄送至電子郵件信箱,請耐心等候",
"support.wallet.invoicing": "開立發票",
"support.wallet.moduleName.index": "產生索引",
"support.wallet.moduleName.qa": "問答拆分",
"support.wallet.noBill": "無帳單紀錄",
"support.wallet.no_invoice": "無發票紀錄",

View File

@ -3,11 +3,16 @@
"add_file": "新增文件",
"api_file": "API 檔案庫",
"api_url": "介面位址",
"auto_indexes": "自動生成補充索引",
"auto_indexes_tips": "通過大模型進行額外索引生成,提高語義豐富度,提高檢索的精度。",
"chunk_max_tokens": "分塊上限",
"close_auto_sync": "確認關閉自動同步功能?",
"collection.Create update time": "建立/更新時間",
"collection.Training type": "分段模式",
"collection.training_type": "處理模式",
"collection_data_count": "數據量",
"collection_metadata_custom_pdf_parse": "PDF增強解析",
"collection_metadata_image_parse": "圖片標註",
"collection_not_support_retraining": "此集合類型不支援重新調整參數",
"collection_not_support_sync": "該集合不支援同步",
"collection_sync": "立即同步",
@ -22,12 +27,21 @@
"custom_data_process_params_desc": "自訂資料處理規則",
"data.ideal_chunk_length": "理想分塊長度",
"data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引",
"data_index_custom": "自定義索引",
"data_index_default": "默認索引",
"data_index_image": "圖片索引",
"data_index_num": "索引 {{index}}",
"data_index_question": "推測問題索引",
"data_index_summary": "摘要索引",
"data_process_params": "處理參數",
"data_process_setting": "資料處理設定",
"dataset.Unsupported operation": "操作不支持",
"dataset.no_collections": "尚無資料集",
"dataset.no_tags": "尚無標籤",
"default_params": "預設",
"default_params_desc": "使用系統默認的參數和規則",
"edit_dataset_config": "編輯知識庫配置",
"enhanced_indexes": "索引增強",
"error.collectionNotFound": "找不到集合",
"external_file": "外部檔案庫",
"external_file_dataset_desc": "可以從外部檔案庫匯入檔案建立資料集,檔案不會進行二次儲存",
@ -38,19 +52,38 @@
"feishu_dataset": "飛書知識庫",
"feishu_dataset_config": "配置飛書知識庫",
"feishu_dataset_desc": "可通過配置飛書文檔權限,使用飛書文檔構建知識庫,文檔不會進行二次存儲",
"file_list": "文件列表",
"file_model_function_tip": "用於增強索引和問答生成",
"filename": "檔案名稱",
"folder_dataset": "資料夾",
"ideal_chunk_length": "理想分塊長度",
"ideal_chunk_length_tips": "依結束符號進行分段,並將多個分段組成一個分塊,此值決定了分塊的預估大小,可能會有上下浮動。",
"image_auto_parse": "圖片自動索引",
"image_auto_parse_tips": "調用 VLM 自動標註文檔裡的圖片,並生成額外的檢索索引",
"import.Auto mode Estimated Price Tips": "需呼叫文字理解模型,將消耗較多 AI 點數:{{price}} 點數 / 1K tokens",
"import.Embedding Estimated Price Tips": "僅使用索引模型,消耗少量 AI 點數:{{price}} 點數 / 1K tokens",
"import_confirm": "確認上傳",
"import_data_preview": "數據預覽",
"import_data_process_setting": "數據處理方式設置",
"import_file_parse_setting": "文件解析設置",
"import_model_config": "模型選擇",
"import_param_setting": "參數設置",
"import_select_file": "選擇文件",
"is_open_schedule": "啟用定時同步",
"keep_image": "保留圖片",
"move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",
"open_auto_sync": "開啟定時同步後,系統將每天不定時嘗試同步集合,集合同步期間,會出現無法搜尋到該集合資料現象。",
"params_setting": "參數設置",
"pdf_enhance_parse": "PDF增強解析",
"pdf_enhance_parse_price": "{{price}}積分/頁",
"pdf_enhance_parse_tips": "解析 PDF 文件時,調用 PDF 識別模型進行識別,可以將其轉換成 Markdown 並保留文檔中的圖片,同時也可以對掃描件進行識別。",
"permission.des.manage": "可管理整個資料集的資料和資訊",
"permission.des.read": "可檢視資料集內容",
"permission.des.write": "可新增和變更資料集內容",
"preview_chunk": "分塊預覽",
"preview_chunk_empty": "無法讀取該文件內容",
"preview_chunk_intro": "最多展示 10 個分塊",
"preview_chunk_not_selected": "點擊左側文件後進行預覽",
"rebuild_embedding_start_tip": "切換索引模型任務已開始",
"rebuilding_index_count": "重建中索引數量:{{count}}",
"request_headers": "請求頭",
@ -72,8 +105,10 @@
"tag.tags": "標籤",
"tag.total_tags": "共 {{total}} 個標籤",
"the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "資料集有索引正在訓練或重建中",
"total_num_files": "共 {{total}} 個文件",
"training_mode": "分段模式",
"vector_model_max_tokens_tip": "每個分塊數據,最大長度為 3000 tokens",
"vllm_model": "圖片理解模型",
"website_dataset": "網站同步",
"website_dataset_desc": "網站同步功能讓您可以直接使用網頁連結建立資料集",
"yuque_dataset": "語雀知識庫",

View File

@ -4,8 +4,9 @@
"lafEnv": "https://laf.dev" // laf https://laf.run ,laf使 Laf openapi laf
},
"systemEnv": {
"vectorMaxProcess": 15, // 线
"qaMaxProcess": 15, // 线
"vectorMaxProcess": 10, // 线
"qaMaxProcess": 10, // 线
"vlmMaxProcess": 10, //
"tokenWorkers": 30, // Token 线
"pgHNSWEfSearch": 100 // 10099%+
}

View File

@ -35,19 +35,18 @@ const OneRowSelector = ({ list, onchange, disableTip, ...props }: Props) => {
return props.size ? size[props.size] : size['md'];
}, [props.size]);
const avatarList = useMemo(
() =>
list.map((item) => {
const modelData = getModelFromList(
[
...llmModelList,
...embeddingModelList,
...ttsModelList,
...sttModelList,
...reRankModelList
],
item.value
);
const avatarList = useMemo(() => {
const allModels = [
...llmModelList,
...embeddingModelList,
...ttsModelList,
...sttModelList,
...reRankModelList
];
return list
.map((item) => {
const modelData = getModelFromList(allModels, item.value)!;
if (!modelData) return;
return {
value: item.value,
@ -64,17 +63,20 @@ const OneRowSelector = ({ list, onchange, disableTip, ...props }: Props) => {
</Flex>
)
};
}),
[
list,
llmModelList,
embeddingModelList,
ttsModelList,
sttModelList,
reRankModelList,
avatarSize
]
);
})
.filter(Boolean) as {
value: any;
label: React.JSX.Element;
}[];
}, [
list,
llmModelList,
embeddingModelList,
ttsModelList,
sttModelList,
reRankModelList,
avatarSize
]);
return (
<Box
@ -91,6 +93,7 @@ const OneRowSelector = ({ list, onchange, disableTip, ...props }: Props) => {
className="nowheel"
isDisabled={!!disableTip}
list={avatarList}
placeholder={t('common:not_model_config')}
h={'40px'}
{...props}
onchange={(e) => {
@ -112,13 +115,15 @@ const MultipleRowSelector = ({ list, onchange, disableTip, ...props }: Props) =>
const { llmModelList, embeddingModelList, ttsModelList, sttModelList, reRankModelList } =
useSystemStore();
const modelList = useMemo(() => {
return [
const allModels = [
...llmModelList,
...embeddingModelList,
...ttsModelList,
...sttModelList,
...reRankModelList
];
return list.map((item) => getModelFromList(allModels, item.value)!).filter(Boolean);
}, [llmModelList, embeddingModelList, ttsModelList, sttModelList, reRankModelList]);
const [value, setValue] = useState<string[]>([]);
@ -157,6 +162,7 @@ const MultipleRowSelector = ({ list, onchange, disableTip, ...props }: Props) =>
for (const item of list) {
const modelData = getModelFromList(modelList, item.value);
if (!modelData) continue;
const provider =
renderList.find((item) => item.value === (modelData?.provider || 'Other')) ??
renderList[renderList.length - 1];
@ -179,6 +185,7 @@ const MultipleRowSelector = ({ list, onchange, disableTip, ...props }: Props) =>
const SelectedModel = useMemo(() => {
const modelData = getModelFromList(modelList, props.value);
if (!modelData) return <>{t('common:not_model_config')}</>;
setValue([modelData.provider, props.value]);

View File

@ -26,6 +26,7 @@ export type CreateDatasetParams = {
avatar: string;
vectorModel?: string;
agentModel?: string;
vlmModel?: string;
apiServer?: APIFileServer;
feishuServer?: FeishuServer;
yuqueServer?: YuqueServer;

View File

@ -23,7 +23,7 @@ import MyModal from '@fastgpt/web/components/common/MyModal';
import MyTag from '@fastgpt/web/components/common/Tag/index';
import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { batchRun } from '@fastgpt/global/common/fn/utils';
import { batchRun } from '@fastgpt/global/common/system/utils';
import { useToast } from '@fastgpt/web/hooks/useToast';
type ModelTestItem = {

View File

@ -26,7 +26,7 @@ const UsageDetail = ({ usage, onClose }: { usage: UsageItemType; onClose: () =>
[usage.list]
);
const { hasModel, hasToken, hasInputToken, hasOutputToken, hasCharsLen, hasDuration } =
const { hasModel, hasToken, hasInputToken, hasOutputToken, hasCharsLen, hasDuration, hasPages } =
useMemo(() => {
let hasModel = false;
let hasToken = false;
@ -34,7 +34,7 @@ const UsageDetail = ({ usage, onClose }: { usage: UsageItemType; onClose: () =>
let hasOutputToken = false;
let hasCharsLen = false;
let hasDuration = false;
let hasDataLen = false;
let hasPages = false;
usage.list.forEach((item) => {
if (item.model !== undefined) {
@ -56,6 +56,9 @@ const UsageDetail = ({ usage, onClose }: { usage: UsageItemType; onClose: () =>
if (typeof item.duration === 'number') {
hasDuration = true;
}
if (typeof item.pages === 'number') {
hasPages = true;
}
});
return {
@ -65,7 +68,7 @@ const UsageDetail = ({ usage, onClose }: { usage: UsageItemType; onClose: () =>
hasOutputToken,
hasCharsLen,
hasDuration,
hasDataLen
hasPages
};
}, [usage.list]);
@ -113,6 +116,7 @@ const UsageDetail = ({ usage, onClose }: { usage: UsageItemType; onClose: () =>
{hasOutputToken && <Th>{t('account_usage:output_token_length')}</Th>}
{hasCharsLen && <Th>{t('account_usage:text_length')}</Th>}
{hasDuration && <Th>{t('account_usage:duration_seconds')}</Th>}
{hasPages && <Th>{t('account_usage:pages')}</Th>}
<Th>{t('account_usage:total_points_consumed')}</Th>
</Tr>
</Thead>
@ -126,6 +130,7 @@ const UsageDetail = ({ usage, onClose }: { usage: UsageItemType; onClose: () =>
{hasOutputToken && <Td>{item.outputTokens ?? '-'}</Td>}
{hasCharsLen && <Td>{item.charsLength ?? '-'}</Td>}
{hasDuration && <Td>{item.duration ?? '-'}</Td>}
{hasPages && <Td>{item.pages ?? '-'}</Td>}
<Td>{formatNumber(item.amount)}</Td>
</Tr>
))}

View File

@ -87,8 +87,8 @@ const UsageTableList = ({
'common:support.wallet.usage.Audio Speech'
),
['support.wallet.usage.Whisper']: t('common:support.wallet.usage.Whisper'),
['support.wallet.moduleName.index']: t('common:support.wallet.moduleName.index'),
['support.wallet.moduleName.qa']: t('common:support.wallet.moduleName.qa'),
['account_usage:embedding_index']: t('account_usage:embedding_index'),
['account_usage:qa']: t('account_usage:qa'),
['core.dataset.training.Auto mode']: t('common:core.dataset.training.Auto mode'),
['common:core.module.template.ai_chat']: t('common:core.module.template.ai_chat')
},
@ -122,49 +122,51 @@ const UsageTableList = ({
onConfirm={exportUsage}
/>
</Flex>
<MyBox position={'relative'} overflowY={'auto'} mt={3} flex={1} isLoading={isLoading}>
<TableContainer>
<Table>
<Thead>
<Tr>
<Th>{t('common:user.Time')}</Th>
<Th>{t('account_usage:member')}</Th>
<Th>{t('account_usage:user_type')}</Th>
<Th>{t('account_usage:project_name')}</Th>
<Th>{t('account_usage:total_points')}</Th>
<Th></Th>
</Tr>
</Thead>
<Tbody fontSize={'sm'}>
{usages.map((item) => (
<Tr key={item.id}>
<Td>{dayjs(item.time).format('YYYY/MM/DD HH:mm:ss')}</Td>
<Td>
<Flex alignItems={'center'} color={'myGray.500'}>
<Avatar src={item.sourceMember.avatar} w={'20px'} mr={1} rounded={'full'} />
{item.sourceMember.name}
</Flex>
</Td>
<Td>{t(UsageSourceMap[item.source]?.label as any) || '-'}</Td>
<Td>{t(item.appName as any) || '-'}</Td>
<Td>{formatNumber(item.totalPoints) || 0}</Td>
<Td>
<Button
size={'sm'}
variant={'whitePrimary'}
onClick={() => setUsageDetail(item)}
>
{t('account_usage:details')}
</Button>
</Td>
<MyBox mt={3} flex={'1 0 0'} h={0} isLoading={isLoading}>
<Box h={'100%'} overflow={'auto'}>
<TableContainer>
<Table>
<Thead>
<Tr>
<Th>{t('common:user.Time')}</Th>
<Th>{t('account_usage:member')}</Th>
<Th>{t('account_usage:user_type')}</Th>
<Th>{t('account_usage:project_name')}</Th>
<Th>{t('account_usage:total_points')}</Th>
<Th></Th>
</Tr>
))}
</Tbody>
</Table>
{!isLoading && usages.length === 0 && (
<EmptyTip text={t('account_usage:no_usage_records')}></EmptyTip>
)}
</TableContainer>
</Thead>
<Tbody fontSize={'sm'}>
{usages.map((item) => (
<Tr key={item.id}>
<Td>{dayjs(item.time).format('YYYY/MM/DD HH:mm:ss')}</Td>
<Td>
<Flex alignItems={'center'} color={'myGray.500'}>
<Avatar src={item.sourceMember.avatar} w={'20px'} mr={1} rounded={'full'} />
{item.sourceMember.name}
</Flex>
</Td>
<Td>{t(UsageSourceMap[item.source]?.label as any) || '-'}</Td>
<Td>{t(item.appName as any) || '-'}</Td>
<Td>{formatNumber(item.totalPoints) || 0}</Td>
<Td>
<Button
size={'sm'}
variant={'whitePrimary'}
onClick={() => setUsageDetail(item)}
>
{t('account_usage:details')}
</Button>
</Td>
</Tr>
))}
</Tbody>
</Table>
{!isLoading && usages.length === 0 && (
<EmptyTip text={t('account_usage:no_usage_records')}></EmptyTip>
)}
</TableContainer>
</Box>
</MyBox>
<Flex mt={3} justifyContent={'center'}>
<Pagination />

View File

@ -18,7 +18,7 @@ import { useQuery } from '@tanstack/react-query';
import { useTranslation } from 'next-i18next';
import MyIcon from '@fastgpt/web/components/common/Icon';
import MyInput from '@/components/MyInput';
import { useRequest } from '@fastgpt/web/hooks/useRequest';
import { useRequest, useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { useRouter } from 'next/router';
import { useSystemStore } from '@/web/common/system/useSystemStore';
import MyMenu from '@fastgpt/web/components/common/MyMenu';
@ -28,7 +28,8 @@ import {
TrainingModeEnum,
DatasetTypeEnum,
DatasetTypeMap,
DatasetStatusEnum
DatasetStatusEnum,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import EditFolderModal, { useEditFolder } from '../../EditFolderModal';
import { TabEnum } from '../../../../pages/dataset/detail/index';
@ -41,6 +42,7 @@ import { CollectionPageContext } from './Context';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { useSystem } from '@fastgpt/web/hooks/useSystem';
import HeaderTagPopOver from './HeaderTagPopOver';
import MyBox from '@fastgpt/web/components/common/MyBox';
const FileSourceSelector = dynamic(() => import('../Import/components/FileSourceSelector'));
@ -48,7 +50,7 @@ const Header = ({}: {}) => {
const { t } = useTranslation();
const theme = useTheme();
const { setLoading, feConfigs } = useSystemStore();
const { feConfigs } = useSystemStore();
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const router = useRouter();
@ -69,50 +71,36 @@ const Header = ({}: {}) => {
tip: t('common:dataset.Manual collection Tip'),
canEmpty: false
});
const {
isOpen: isOpenFileSourceSelector,
onOpen: onOpenFileSourceSelector,
onClose: onCloseFileSourceSelector
} = useDisclosure();
const { mutate: onCreateCollection } = useRequest({
mutationFn: async ({
name,
type,
callback,
...props
}: {
name: string;
type: DatasetCollectionTypeEnum;
callback?: (id: string) => void;
trainingType?: TrainingModeEnum;
rawLink?: string;
chunkSize?: number;
}) => {
setLoading(true);
const { runAsync: onCreateCollection, loading: onCreating } = useRequest2(
async ({ name, type }: { name: string; type: DatasetCollectionTypeEnum }) => {
const id = await postDatasetCollection({
parentId,
datasetId: datasetDetail._id,
name,
type,
...props
type
});
callback?.(id);
return id;
},
onSuccess() {
getData(pageNum);
},
onSettled() {
setLoading(false);
},
{
onSuccess() {
getData(pageNum);
},
successToast: t('common:common.Create Success'),
errorToast: t('common:common.Create Failed')
}
);
successToast: t('common:common.Create Success'),
errorToast: t('common:common.Create Failed')
});
const isWebSite = datasetDetail?.type === DatasetTypeEnum.websiteDataset;
return (
<Box display={['block', 'flex']} alignItems={'center'} gap={2}>
<MyBox isLoading={onCreating} display={['block', 'flex']} alignItems={'center'} gap={2}>
<HStack flex={1}>
<Box flex={1} fontWeight={'500'} color={'myGray.900'} whiteSpace={'nowrap'}>
<ParentPath
@ -446,7 +434,7 @@ const Header = ({}: {}) => {
)}
<EditCreateVirtualFileModal iconSrc={'modal/manualDataset'} closeBtnText={''} />
{isOpenFileSourceSelector && <FileSourceSelector onClose={onCloseFileSourceSelector} />}
</Box>
</MyBox>
);
};

View File

@ -29,7 +29,8 @@ import {
DatasetCollectionTypeEnum,
DatasetStatusEnum,
DatasetCollectionSyncResultMap,
DatasetTypeEnum
DatasetTypeEnum,
DatasetCollectionDataProcessModeMap
} from '@fastgpt/global/core/dataset/constants';
import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
import { TabEnum } from '../../../../pages/dataset/detail/index';
@ -44,10 +45,7 @@ import { CollectionPageContext } from './Context';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { formatTime2YMDHM } from '@fastgpt/global/common/string/time';
import MyTag from '@fastgpt/web/components/common/Tag/index';
import {
checkCollectionIsFolder,
getTrainingTypeLabel
} from '@fastgpt/global/core/dataset/collection/utils';
import { checkCollectionIsFolder } from '@fastgpt/global/core/dataset/collection/utils';
import { useFolderDrag } from '@/components/common/folder/useFolderDrag';
import TagsPopOver from './TagsPopOver';
import { useSystemStore } from '@/web/common/system/useSystemStore';
@ -194,7 +192,7 @@ const CollectionCard = () => {
<Thead draggable={false}>
<Tr>
<Th py={4}>{t('common:common.Name')}</Th>
<Th py={4}>{t('dataset:collection.Training type')}</Th>
<Th py={4}>{t('dataset:collection.training_type')}</Th>
<Th py={4}>{t('dataset:collection_data_count')}</Th>
<Th py={4}>{t('dataset:collection.Create update time')}</Th>
<Th py={4}>{t('common:common.Status')}</Th>
@ -251,7 +249,14 @@ const CollectionCard = () => {
</Td>
<Td py={2}>
{!checkCollectionIsFolder(collection.type) ? (
<>{t((getTrainingTypeLabel(collection.trainingType) || '-') as any)}</>
<>
{collection.trainingType
? t(
(DatasetCollectionDataProcessModeMap[collection.trainingType]
?.label || '-') as any
)
: '-'}
</>
) : (
'-'
)}

View File

@ -1,13 +1,16 @@
import { useRouter } from 'next/router';
import { SetStateAction, useState } from 'react';
import { SetStateAction, useMemo, useState } from 'react';
import { useTranslation } from 'next-i18next';
import { createContext, useContextSelector } from 'use-context-selector';
import { ImportDataSourceEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import {
DatasetCollectionDataProcessModeEnum,
ImportDataSourceEnum
} from '@fastgpt/global/core/dataset/constants';
import { useMyStep } from '@fastgpt/web/hooks/useStep';
import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon';
import { TabEnum } from '../NavBar';
import { ImportProcessWayEnum } from '@/web/core/dataset/constants';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
import { UseFormReturn, useForm } from 'react-hook-form';
import { ImportSourceItemType } from '@/web/core/dataset/type';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
@ -19,12 +22,10 @@ type TrainingFiledType = {
minChunkSize: number;
autoChunkSize: number;
chunkSize: number;
showChunkInput: boolean;
showPromptInput: boolean;
charsPointsPrice: number;
priceTip: string;
uploadRate: number;
chunkSizeField?: ChunkSizeFieldType;
chunkSizeField: ChunkSizeFieldType;
};
type DatasetImportContextType = {
importSource: ImportDataSourceEnum;
@ -39,8 +40,13 @@ type DatasetImportContextType = {
type ChunkSizeFieldType = 'embeddingChunkSize' | 'qaChunkSize';
export type ImportFormType = {
mode: TrainingModeEnum;
way: ImportProcessWayEnum;
customPdfParse: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
imageIndex: boolean;
autoIndexes: boolean;
chunkSettingMode: ChunkSettingModeEnum;
embeddingChunkSize: number;
qaChunkSize: number;
customSplitChar: string;
@ -58,8 +64,6 @@ export const DatasetImportContext = createContext<DatasetImportContextType>({
maxChunkSize: 0,
minChunkSize: 0,
showChunkInput: false,
showPromptInput: false,
sources: [],
setSources: function (value: SetStateAction<ImportSourceItemType[]>): void {
throw new Error('Function not implemented.');
@ -88,72 +92,93 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
const modeSteps: Record<ImportDataSourceEnum, { title: string }[]> = {
[ImportDataSourceEnum.reTraining]: [
{ title: t('dataset:core.dataset.import.Adjust parameters') },
{ title: t('common:core.dataset.import.Upload data') }
{
title: t('dataset:import_data_preview')
},
{ title: t('dataset:import_confirm') }
],
[ImportDataSourceEnum.fileLocal]: [
{
title: t('common:core.dataset.import.Select file')
title: t('dataset:import_select_file')
},
{
title: t('common:core.dataset.import.Data Preprocessing')
title: t('dataset:import_param_setting')
},
{
title: t('common:core.dataset.import.Upload data')
title: t('dataset:import_data_preview')
},
{
title: t('dataset:import_confirm')
}
],
[ImportDataSourceEnum.fileLink]: [
{
title: t('common:core.dataset.import.Select file')
title: t('dataset:import_select_file')
},
{
title: t('common:core.dataset.import.Data Preprocessing')
title: t('dataset:import_param_setting')
},
{
title: t('common:core.dataset.import.Upload data')
title: t('dataset:import_data_preview')
},
{
title: t('dataset:import_confirm')
}
],
[ImportDataSourceEnum.fileCustom]: [
{
title: t('common:core.dataset.import.Select file')
title: t('dataset:import_select_file')
},
{
title: t('common:core.dataset.import.Data Preprocessing')
title: t('dataset:import_param_setting')
},
{
title: t('common:core.dataset.import.Upload data')
title: t('dataset:import_data_preview')
},
{
title: t('dataset:import_confirm')
}
],
[ImportDataSourceEnum.csvTable]: [
{
title: t('common:core.dataset.import.Select file')
title: t('dataset:import_select_file')
},
{
title: t('common:core.dataset.import.Data Preprocessing')
title: t('dataset:import_param_setting')
},
{
title: t('common:core.dataset.import.Upload data')
title: t('dataset:import_data_preview')
},
{
title: t('dataset:import_confirm')
}
],
[ImportDataSourceEnum.externalFile]: [
{
title: t('common:core.dataset.import.Select file')
title: t('dataset:import_select_file')
},
{
title: t('common:core.dataset.import.Data Preprocessing')
title: t('dataset:import_param_setting')
},
{
title: t('common:core.dataset.import.Upload data')
title: t('dataset:import_data_preview')
},
{
title: t('dataset:import_confirm')
}
],
[ImportDataSourceEnum.apiDataset]: [
{
title: t('common:core.dataset.import.Select file')
title: t('dataset:import_select_file')
},
{
title: t('common:core.dataset.import.Data Preprocessing')
title: t('dataset:import_param_setting')
},
{
title: t('common:core.dataset.import.Upload data')
title: t('dataset:import_data_preview')
},
{
title: t('dataset:import_confirm')
}
]
};
@ -168,96 +193,114 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
const processParamsForm = useForm<ImportFormType>({
defaultValues: {
mode: TrainingModeEnum.chunk,
way: ImportProcessWayEnum.auto,
imageIndex: false,
autoIndexes: false,
trainingType: DatasetCollectionDataProcessModeEnum.chunk,
chunkSettingMode: ChunkSettingModeEnum.auto,
embeddingChunkSize: vectorModel?.defaultToken || 512,
qaChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7),
customSplitChar: '',
qaPrompt: Prompt_AgentQA.description,
webSelector: ''
webSelector: '',
customPdfParse: false
}
});
const [sources, setSources] = useState<ImportSourceItemType[]>([]);
// watch form
const mode = processParamsForm.watch('mode');
const way = processParamsForm.watch('way');
const trainingType = processParamsForm.watch('trainingType');
const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
const qaChunkSize = processParamsForm.watch('qaChunkSize');
const customSplitChar = processParamsForm.watch('customSplitChar');
const autoIndexes = processParamsForm.watch('autoIndexes');
const modeStaticParams: Record<TrainingModeEnum, TrainingFiledType> = {
[TrainingModeEnum.auto]: {
chunkOverlapRatio: 0.2,
maxChunkSize: 2048,
minChunkSize: 100,
autoChunkSize: vectorModel?.defaultToken ? vectorModel?.defaultToken * 2 : 1024,
chunkSize: vectorModel?.defaultToken ? vectorModel?.defaultToken * 2 : 1024,
showChunkInput: false,
showPromptInput: false,
charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
price: agentModel.charsPointsPrice
}),
uploadRate: 100
},
[TrainingModeEnum.chunk]: {
chunkSizeField: 'embeddingChunkSize' as ChunkSizeFieldType,
chunkOverlapRatio: 0.2,
maxChunkSize: vectorModel?.maxToken || 512,
minChunkSize: 100,
autoChunkSize: vectorModel?.defaultToken || 512,
chunkSize: embeddingChunkSize,
showChunkInput: true,
showPromptInput: false,
charsPointsPrice: vectorModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Embedding Estimated Price Tips', {
price: vectorModel.charsPointsPrice
}),
uploadRate: 150
},
[TrainingModeEnum.qa]: {
chunkSizeField: 'qaChunkSize' as ChunkSizeFieldType,
chunkOverlapRatio: 0,
maxChunkSize: Math.min(agentModel.maxResponse * 4, agentModel.maxContext * 0.7),
minChunkSize: 4000,
autoChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7),
chunkSize: qaChunkSize,
showChunkInput: true,
showPromptInput: true,
charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
price: agentModel.charsPointsPrice
}),
uploadRate: 30
const TrainingModeMap = useMemo<TrainingFiledType>(() => {
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return {
chunkSizeField: 'qaChunkSize',
chunkOverlapRatio: 0,
maxChunkSize: Math.min(agentModel.maxResponse * 4, agentModel.maxContext * 0.7),
minChunkSize: 4000,
autoChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7),
chunkSize: qaChunkSize,
charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
price: agentModel.charsPointsPrice
}),
uploadRate: 30
};
} else if (autoIndexes) {
return {
chunkSizeField: 'embeddingChunkSize',
chunkOverlapRatio: 0.2,
maxChunkSize: 2048,
minChunkSize: 100,
autoChunkSize: vectorModel?.defaultToken ? vectorModel.defaultToken * 2 : 1024,
chunkSize: embeddingChunkSize,
charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
price: agentModel.charsPointsPrice
}),
uploadRate: 100
};
} else {
return {
chunkSizeField: 'embeddingChunkSize',
chunkOverlapRatio: 0.2,
maxChunkSize: vectorModel?.maxToken || 512,
minChunkSize: 100,
autoChunkSize: vectorModel?.defaultToken || 512,
chunkSize: embeddingChunkSize,
charsPointsPrice: vectorModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Embedding Estimated Price Tips', {
price: vectorModel.charsPointsPrice
}),
uploadRate: 150
};
}
};
const selectModelStaticParam = modeStaticParams[mode];
}, [
trainingType,
autoIndexes,
agentModel.maxResponse,
agentModel.maxContext,
agentModel.charsPointsPrice,
qaChunkSize,
t,
vectorModel.defaultToken,
vectorModel?.maxToken,
vectorModel.charsPointsPrice,
embeddingChunkSize
]);
const wayStaticPrams = {
[ImportProcessWayEnum.auto]: {
chunkSize: selectModelStaticParam.autoChunkSize,
customSplitChar: ''
},
[ImportProcessWayEnum.custom]: {
chunkSize: modeStaticParams[mode].chunkSize,
customSplitChar
const chunkSettingModeMap = useMemo(() => {
if (chunkSettingMode === ChunkSettingModeEnum.auto) {
return {
chunkSize: TrainingModeMap.autoChunkSize,
customSplitChar: ''
};
} else {
return {
chunkSize: TrainingModeMap.chunkSize,
customSplitChar
};
}
};
const chunkSize = wayStaticPrams[way].chunkSize;
}, [chunkSettingMode, TrainingModeMap.autoChunkSize, TrainingModeMap.chunkSize, customSplitChar]);
const contextValue = {
...TrainingModeMap,
...chunkSettingModeMap,
importSource: source,
parentId,
activeStep,
goToNext,
processParamsForm,
...selectModelStaticParam,
sources,
setSources,
chunkSize
setSources
};
return (
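
A minimal consumer sketch (component name assumed): downstream import steps read the derived chunk parameters straight off the merged context value instead of recomputing them:

```tsx
import React from 'react';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from './Context'; // relative path assumed

// Hypothetical step component reading the params derived from trainingType/autoIndexes.
const ChunkSizeSummary = () => {
  const { chunkSize, minChunkSize, maxChunkSize } = useContextSelector(
    DatasetImportContext,
    (v) => v
  );
  return <>{`Chunk size: ${chunkSize} (allowed range ${minChunkSize} ~ ${maxChunkSize})`}</>;
};

export default ChunkSizeSummary;
```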

View File

@ -1,4 +1,4 @@
import React, { useCallback, useMemo, useRef } from 'react';
import React, { useCallback, useEffect, useMemo, useRef } from 'react';
import {
Box,
Flex,
@ -7,27 +7,37 @@ import {
ModalBody,
ModalFooter,
Textarea,
useDisclosure
useDisclosure,
Checkbox,
Accordion,
AccordionItem,
AccordionButton,
AccordionPanel,
AccordionIcon,
HStack
} from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon';
import { useTranslation } from 'next-i18next';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import { TrainingModeEnum, TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
import { ImportProcessWayEnum } from '@/web/core/dataset/constants';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionDataProcessModeMap
} from '@fastgpt/global/core/dataset/constants';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
import { useSystemStore } from '@/web/common/system/useSystemStore';
import MyModal from '@fastgpt/web/components/common/MyModal';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import Preview from '../components/Preview';
import MyTag from '@fastgpt/web/components/common/Tag/index';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
import { useToast } from '@fastgpt/web/hooks/useToast';
import FormLabel from '@fastgpt/web/components/common/MyBox/FormLabel';
import MyNumberInput from '@fastgpt/web/components/common/Input/NumberInput';
import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
import { shadowLight } from '@fastgpt/web/styles/theme';
import AIModelSelector from '@/components/Select/AIModelSelector';
function DataProcess({ showPreviewChunks = true }: { showPreviewChunks: boolean }) {
function DataProcess() {
const { t } = useTranslation();
const { feConfigs } = useSystemStore();
@ -36,16 +46,13 @@ function DataProcess({ showPreviewChunks = true }: { showPreviewChunks: boolean
processParamsForm,
chunkSizeField,
minChunkSize,
showChunkInput,
showPromptInput,
maxChunkSize,
priceTip,
chunkSize
} = useContextSelector(DatasetImportContext, (v) => v);
const { getValues, setValue, register, watch } = processParamsForm;
const { toast } = useToast();
const mode = watch('mode');
const way = watch('way');
const trainingType = watch('trainingType');
const chunkSettingMode = watch('chunkSettingMode');
const {
isOpen: isOpenCustomPrompt,
@ -54,214 +61,315 @@ function DataProcess({ showPreviewChunks = true }: { showPreviewChunks: boolean
} = useDisclosure();
const trainingModeList = useMemo(() => {
const list = Object.entries(TrainingTypeMap);
return list;
const list = Object.entries(DatasetCollectionDataProcessModeMap);
return list
.filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto)
.map(([key, value]) => ({
title: t(value.label as any),
value: key as DatasetCollectionDataProcessModeEnum,
tooltip: t(value.tooltip as any)
}));
}, []);
const onSelectTrainWay = useCallback(
(e: TrainingModeEnum) => {
if (!feConfigs?.isPlus && !TrainingTypeMap[e]?.openSource) {
return toast({
status: 'warning',
title: t('common:common.system.Commercial version function')
});
}
setValue('mode', e);
},
[feConfigs?.isPlus, setValue, t, toast]
);
const Title = useCallback(({ title }: { title: string }) => {
return (
<AccordionButton bg={'none !important'} p={2}>
<Box w={'3px'} h={'16px'} bg={'primary.600'} borderRadius={'2px'} mr={2} />
<Box color={'myGray.900'} flex={'1 0 0'} textAlign={'left'}>
{title}
</Box>
<AccordionIcon />
</AccordionButton>
);
}, []);
// Adapt auto training
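// Collections saved with the legacy trainingType=auto are remapped on the client:
// the form falls back to chunk mode with autoIndexes enabled, matching the server-side
// migration later in this commit that rewrites auto collections to chunk + autoIndexes.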
useEffect(() => {
if (trainingType === DatasetCollectionDataProcessModeEnum.auto) {
setValue('autoIndexes', true);
setValue('trainingType', DatasetCollectionDataProcessModeEnum.chunk);
}
}, [trainingType, setValue]);
const showFileParseSetting = feConfigs?.showCustomPdfParse;
const showQAPromptInput = trainingType === DatasetCollectionDataProcessModeEnum.qa;
return (
<Box h={'100%'} display={['block', 'flex']} fontSize={'sm'}>
<Box
flex={'1 0 0'}
minW={['auto', '500px']}
maxW={'600px'}
h={['auto', '100%']}
overflow={'auto'}
pr={[0, 3]}
>
<Flex alignItems={'center'}>
<MyIcon name={'common/settingLight'} w={'20px'} />
<Box fontSize={'md'}>{t('dataset:data_process_setting')}</Box>
</Flex>
<>
<Box flex={'1 0 0'} maxW={['90vw', '640px']} m={'auto'} overflow={'auto'}>
<Accordion allowMultiple reduceMotion defaultIndex={[0, 1, 2]}>
{showFileParseSetting && (
<AccordionItem border={'none'} borderBottom={'base'} pb={4}>
<Title title={t('dataset:import_file_parse_setting')} />
<Box display={['block', 'flex']} mt={4} alignItems={'center'}>
<FormLabel flex={'0 0 100px'}>{t('dataset:training_mode')}</FormLabel>
<LeftRadio
list={trainingModeList.map(([key, value]) => ({
title: t(value.label as any),
value: key,
tooltip: t(value.tooltip as any)
}))}
px={3}
py={2}
value={mode}
onChange={onSelectTrainWay}
defaultBg="white"
activeBg="white"
display={'flex'}
flexWrap={'wrap'}
/>
</Box>
<Box display={['block', 'flex']} mt={5}>
<FormLabel flex={'0 0 100px'}>{t('dataset:data_process_params')}</FormLabel>
<LeftRadio
list={[
{
title: t('common:core.dataset.import.Auto process'),
desc: t('common:core.dataset.import.Auto process desc'),
value: ImportProcessWayEnum.auto
},
{
title: t('dataset:custom_data_process_params'),
desc: t('dataset:custom_data_process_params_desc'),
value: ImportProcessWayEnum.custom,
children: way === ImportProcessWayEnum.custom && (
<Box mt={5}>
{showChunkInput && chunkSizeField && (
<Box>
<Flex alignItems={'center'}>
<Box>{t('dataset:ideal_chunk_length')}</Box>
<QuestionTip label={t('dataset:ideal_chunk_length_tips')} />
</Flex>
<Box
mt={1}
css={{
'& > span': {
display: 'block'
}
}}
>
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', {
min: minChunkSize,
max: maxChunkSize
})}
>
<MyNumberInput
name={chunkSizeField}
min={minChunkSize}
max={maxChunkSize}
size={'sm'}
step={100}
value={chunkSize}
onChange={(e) => {
if (e === undefined) return;
setValue(chunkSizeField, +e);
}}
/>
</MyTooltip>
</Box>
</Box>
)}
<Box mt={3}>
<Box>
{t('common:core.dataset.import.Custom split char')}
<QuestionTip
label={t('common:core.dataset.import.Custom split char Tips')}
/>
</Box>
<Box mt={1}>
<Input
size={'sm'}
bg={'myGray.50'}
defaultValue={''}
placeholder="\n;======;==SPLIT=="
{...register('customSplitChar')}
/>
</Box>
</Box>
{showPromptInput && (
<Box mt={3}>
<Box>{t('common:core.dataset.collection.QA Prompt')}</Box>
<Box
position={'relative'}
py={2}
px={3}
bg={'myGray.50'}
fontSize={'xs'}
whiteSpace={'pre-wrap'}
border={'1px'}
borderColor={'borderColor.base'}
<AccordionPanel p={2}>
<Flex
flexDirection={'column'}
gap={3}
border={'1px solid'}
borderColor={'primary.600'}
borderRadius={'md'}
boxShadow={shadowLight}
p={4}
>
{feConfigs.showCustomPdfParse && (
<HStack spacing={1}>
<Checkbox {...register('customPdfParse')}>
<FormLabel>{t('dataset:pdf_enhance_parse')}</FormLabel>
</Checkbox>
<QuestionTip label={t('dataset:pdf_enhance_parse_tips')} />
{feConfigs?.show_pay && (
<MyTag
type={'borderSolid'}
borderColor={'myGray.200'}
bg={'myGray.100'}
color={'primary.600'}
py={1.5}
borderRadius={'md'}
maxH={'140px'}
overflow={'auto'}
_hover={{
'& .mask': {
display: 'block'
}
}}
px={3}
whiteSpace={'wrap'}
ml={1}
>
{getValues('qaPrompt')}
{t('dataset:pdf_enhance_parse_price', {
price: feConfigs.customPdfParsePrice || 0
})}
</MyTag>
)}
</HStack>
)}
</Flex>
</AccordionPanel>
</AccordionItem>
)}
<Box
display={'none'}
className="mask"
position={'absolute'}
top={0}
right={0}
bottom={0}
left={0}
background={
'linear-gradient(182deg, rgba(255, 255, 255, 0.00) 1.76%, #FFF 84.07%)'
}
>
<Button
size="xs"
variant={'whiteBase'}
leftIcon={<MyIcon name={'edit'} w={'13px'} />}
color={'black'}
position={'absolute'}
right={2}
bottom={2}
onClick={onOpenCustomPrompt}
>
{t('common:core.dataset.import.Custom prompt')}
</Button>
</Box>
</Box>
</Box>
)}
<AccordionItem mt={4} border={'none'}>
<Title title={t('dataset:import_data_process_setting')} />
<AccordionPanel p={2}>
<Box mt={2}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:training_mode')}
</Box>
<LeftRadio<DatasetCollectionDataProcessModeEnum>
list={trainingModeList}
px={3}
py={2.5}
value={trainingType}
onChange={(e) => {
setValue('trainingType', e);
}}
defaultBg="white"
activeBg="white"
gridTemplateColumns={'repeat(2, 1fr)'}
/>
</Box>
{trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
<Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:enhanced_indexes')}
</Box>
)
}
]}
px={3}
py={3}
defaultBg="white"
activeBg="white"
value={way}
w={'100%'}
onChange={(e) => {
setValue('way', e);
}}
></LeftRadio>
</Box>
{feConfigs?.isPlus && (
<HStack gap={[3, 7]}>
<HStack flex={'1'} spacing={1}>
<Checkbox {...register('autoIndexes')}>
<FormLabel>{t('dataset:auto_indexes')}</FormLabel>
</Checkbox>
<QuestionTip label={t('dataset:auto_indexes_tips')} />
</HStack>
<HStack flex={'1'} spacing={1}>
<Checkbox {...register('imageIndex')}>
<FormLabel>{t('dataset:image_auto_parse')}</FormLabel>
</Checkbox>
<QuestionTip label={t('dataset:image_auto_parse_tips')} />
</HStack>
</HStack>
)}
</Box>
)}
<Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:params_setting')}
</Box>
<LeftRadio<ChunkSettingModeEnum>
list={[
{
title: t('dataset:default_params'),
desc: t('dataset:default_params_desc'),
value: ChunkSettingModeEnum.auto
},
{
title: t('dataset:custom_data_process_params'),
desc: t('dataset:custom_data_process_params_desc'),
value: ChunkSettingModeEnum.custom,
children: chunkSettingMode === ChunkSettingModeEnum.custom && (
<Box mt={5}>
<Box>
<Flex alignItems={'center'}>
<Box>{t('dataset:ideal_chunk_length')}</Box>
<QuestionTip label={t('dataset:ideal_chunk_length_tips')} />
</Flex>
<Box
mt={1}
css={{
'& > span': {
display: 'block'
}
}}
>
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', {
min: minChunkSize,
max: maxChunkSize
})}
>
<MyNumberInput
register={register}
name={chunkSizeField}
min={minChunkSize}
max={maxChunkSize}
size={'sm'}
step={100}
/>
</MyTooltip>
</Box>
</Box>
{feConfigs?.show_pay && (
<Box mt={5} pl={[0, '100px']} gap={3}>
<MyTag colorSchema={'gray'} py={1.5} borderRadius={'md'} px={3} whiteSpace={'wrap'}>
{priceTip}
</MyTag>
</Box>
)}
<Box mt={3}>
<Box>
{t('common:core.dataset.import.Custom split char')}
<QuestionTip
label={t('common:core.dataset.import.Custom split char Tips')}
/>
</Box>
<Box mt={1}>
<Input
size={'sm'}
bg={'myGray.50'}
defaultValue={''}
placeholder="\n;======;==SPLIT=="
{...register('customSplitChar')}
/>
</Box>
</Box>
<Flex mt={5} gap={3} justifyContent={'flex-end'}>
<Button
onClick={() => {
goToNext();
}}
>
{t('common:common.Next Step')}
</Button>
</Flex>
</Box>
<Box flex={'1 0 0'} w={['auto', '0']} h={['auto', '100%']} pl={[0, 3]}>
<Preview showPreviewChunks={showPreviewChunks} />
{showQAPromptInput && (
<Box mt={3}>
<Box>{t('common:core.dataset.collection.QA Prompt')}</Box>
<Box
position={'relative'}
py={2}
px={3}
bg={'myGray.50'}
fontSize={'xs'}
whiteSpace={'pre-wrap'}
border={'1px'}
borderColor={'borderColor.base'}
borderRadius={'md'}
maxH={'140px'}
overflow={'auto'}
_hover={{
'& .mask': {
display: 'block'
}
}}
>
{getValues('qaPrompt')}
<Box
display={'none'}
className="mask"
position={'absolute'}
top={0}
right={0}
bottom={0}
left={0}
background={
'linear-gradient(182deg, rgba(255, 255, 255, 0.00) 1.76%, #FFF 84.07%)'
}
>
<Button
size="xs"
variant={'whiteBase'}
leftIcon={<MyIcon name={'edit'} w={'13px'} />}
color={'black'}
position={'absolute'}
right={2}
bottom={2}
onClick={onOpenCustomPrompt}
>
{t('common:core.dataset.import.Custom prompt')}
</Button>
</Box>
</Box>
</Box>
)}
</Box>
)
}
]}
gridGap={3}
px={3}
py={3}
defaultBg="white"
activeBg="white"
value={chunkSettingMode}
w={'100%'}
onChange={(e) => {
setValue('chunkSettingMode', e);
}}
/>
</Box>
</AccordionPanel>
</AccordionItem>
{/* <AccordionItem mt={4} border={'none'}>
<Title title={t('dataset:import_model_config')} />
<AccordionPanel p={2} fontSize={'sm'}>
<Box>
<Box>{t('common:core.ai.model.Dataset Agent Model')}</Box>
<Box mt={1}>
<AIModelSelector
w={'100%'}
value={llmModel}
list={datasetModelList.map((item) => ({
label: item.name,
value: item.model
}))}
onchange={(e) => {
setValue('llmModel', e);
}}
/>
</Box>
</Box>
<Box pt={5}>
<Box>{t('dataset:vllm_model')}</Box>
<Box mt={1}>
<AIModelSelector
w={'100%'}
value={vlmModel}
list={vllmModelList.map((item) => ({
label: item.name,
value: item.model
}))}
onchange={(e) => {
setValue('vlmModel', e);
}}
/>
</Box>
</Box>
</AccordionPanel>
</AccordionItem> */}
<Flex mt={5} gap={3} justifyContent={'flex-end'}>
<Button
onClick={() => {
goToNext();
}}
>
{t('common:common.Next Step')}
</Button>
</Flex>
</Accordion>
</Box>
{isOpenCustomPrompt && (
@ -273,7 +381,7 @@ function DataProcess({ showPreviewChunks = true }: { showPreviewChunks: boolean
onClose={onCloseCustomPrompt}
/>
)}
</Box>
</>
);
}

View File

@ -1,19 +1,160 @@
import React from 'react';
import Preview from '../components/Preview';
import { Box, Button, Flex } from '@chakra-ui/react';
import React, { useState } from 'react';
import { Box, Button, Flex, HStack } from '@chakra-ui/react';
import { useTranslation } from 'next-i18next';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
import MyIcon from '@fastgpt/web/components/common/Icon';
import FormLabel from '@fastgpt/web/components/common/MyBox/FormLabel';
import EmptyTip from '@fastgpt/web/components/common/EmptyTip';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { getPreviewChunks } from '@/web/core/dataset/api';
import { ImportSourceItemType } from '@/web/core/dataset/type';
import { getPreviewSourceReadType } from '../utils';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import MyBox from '@fastgpt/web/components/common/MyBox';
import Markdown from '@/components/Markdown';
import { useToast } from '@fastgpt/web/hooks/useToast';
const PreviewData = ({ showPreviewChunks }: { showPreviewChunks: boolean }) => {
const PreviewData = () => {
const { t } = useTranslation();
const { toast } = useToast();
const goToNext = useContextSelector(DatasetImportContext, (v) => v.goToNext);
const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
const chunkSize = useContextSelector(DatasetImportContext, (v) => v.chunkSize);
const chunkOverlapRatio = useContextSelector(DatasetImportContext, (v) => v.chunkOverlapRatio);
const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
const [previewFile, setPreviewFile] = useState<ImportSourceItemType>();
const { data = [], loading: isLoading } = useRequest2(
async () => {
if (!previewFile) return;
if (importSource === ImportDataSourceEnum.fileCustom) {
const customSplitChar = processParamsForm.getValues('customSplitChar');
const { chunks } = splitText2Chunks({
text: previewFile.rawText || '',
chunkLen: chunkSize,
overlapRatio: chunkOverlapRatio,
customReg: customSplitChar ? [customSplitChar] : []
});
return chunks.map((chunk) => ({
q: chunk,
a: ''
}));
}
return getPreviewChunks({
datasetId,
type: getPreviewSourceReadType(previewFile),
sourceId:
previewFile.dbFileId ||
previewFile.link ||
previewFile.externalFileUrl ||
previewFile.apiFileId ||
'',
customPdfParse: processParamsForm.getValues('customPdfParse'),
chunkSize,
overlapRatio: chunkOverlapRatio,
customSplitChar: processParamsForm.getValues('customSplitChar'),
selector: processParamsForm.getValues('webSelector'),
isQAImport: importSource === ImportDataSourceEnum.csvTable,
externalFileId: previewFile.externalFileId
});
},
{
refreshDeps: [previewFile],
manual: false,
onSuccess(result) {
if (!previewFile) return;
if (!result || result.length === 0) {
toast({
title: t('dataset:preview_chunk_empty'),
status: 'error'
});
}
}
}
);
return (
<Flex flexDirection={'column'} h={'100%'}>
<Box flex={'1 0 0 '}>
<Preview showPreviewChunks={showPreviewChunks} />
</Box>
<Flex flex={'1 0 0'} border={'base'} borderRadius={'md'}>
<Flex flexDirection={'column'} flex={'1 0 0'} borderRight={'base'}>
<FormLabel fontSize={'md'} py={4} px={5} borderBottom={'base'}>
{t('dataset:file_list')}
</FormLabel>
<Box flex={'1 0 0'} overflowY={'auto'} px={5} py={3}>
{sources.map((source) => (
<HStack
key={source.id}
bg={'myGray.50'}
p={4}
borderRadius={'md'}
borderWidth={'1px'}
borderColor={'transparent'}
cursor={'pointer'}
_hover={{
borderColor: 'primary.300'
}}
{...(previewFile?.id === source.id && {
borderColor: 'primary.500 !important',
bg: 'primary.50 !important'
})}
_notLast={{ mb: 3 }}
onClick={() => setPreviewFile(source)}
>
<MyIcon name={source.icon as any} w={'1.25rem'} />
<Box ml={1} flex={'1 0 0'} wordBreak={'break-all'} fontSize={'sm'}>
{source.sourceName}
</Box>
</HStack>
))}
</Box>
</Flex>
<Flex flexDirection={'column'} flex={'1 0 0'}>
<Flex py={4} px={5} borderBottom={'base'} justifyContent={'space-between'}>
<FormLabel fontSize={'md'}>{t('dataset:preview_chunk')}</FormLabel>
<Box fontSize={'xs'} color={'myGray.500'}>
{t('dataset:preview_chunk_intro')}
</Box>
</Flex>
<MyBox isLoading={isLoading} flex={'1 0 0'} overflowY={'auto'} px={5} py={3}>
{previewFile ? (
<>
{data.map((item, index) => (
<Box
key={index}
fontSize={'sm'}
color={'myGray.600'}
_notLast={{
mb: 3,
pb: 3,
borderBottom: 'base'
}}
_hover={{
bg: 'myGray.100'
}}
>
<Markdown source={item.q} />
<Markdown source={item.a} />
</Box>
))}
</>
) : (
<EmptyTip text={t('dataset:preview_chunk_not_selected')} />
)}
</MyBox>
</Flex>
</Flex>
<Flex mt={2} justifyContent={'flex-end'}>
<Button onClick={goToNext}>{t('common:common.Next Step')}</Button>
</Flex>

View File

@ -14,7 +14,10 @@ import {
IconButton,
Tooltip
} from '@chakra-ui/react';
import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
import {
DatasetCollectionDataProcessModeEnum,
ImportDataSourceEnum
} from '@fastgpt/global/core/dataset/constants';
import { useTranslation } from 'next-i18next';
import MyIcon from '@fastgpt/web/components/common/Icon';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
@ -34,6 +37,7 @@ import MyTag from '@fastgpt/web/components/common/Tag/index';
import { useContextSelector } from 'use-context-selector';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DatasetImportContext, type ImportFormType } from '../Context';
import { ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
const Upload = () => {
const { t } = useTranslation();
@ -77,7 +81,7 @@ const Upload = () => {
}, [waitingFilesCount, totalFilesCount, allFinished, t]);
const { runAsync: startUpload, loading: isLoading } = useRequest2(
async ({ mode, customSplitChar, qaPrompt, webSelector }: ImportFormType) => {
async ({ trainingType, customSplitChar, qaPrompt, webSelector }: ImportFormType) => {
if (sources.length === 0) return;
const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@ -95,15 +99,21 @@ const Upload = () => {
);
// create collection
const commonParams = {
const commonParams: ApiCreateDatasetCollectionParams & {
name: string;
} = {
parentId,
trainingType: mode,
datasetId: datasetDetail._id,
name: item.sourceName,
customPdfParse: processParamsForm.getValues('customPdfParse'),
trainingType,
imageIndex: processParamsForm.getValues('imageIndex'),
autoIndexes: processParamsForm.getValues('autoIndexes'),
chunkSize,
chunkSplitter: customSplitChar,
qaPrompt,
name: item.sourceName
qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
};
if (importSource === ImportDataSourceEnum.reTraining) {
const res = await postReTrainingDatasetFileCollection({
@ -272,7 +282,7 @@ const Upload = () => {
<Flex justifyContent={'flex-end'} mt={4}>
<Button isLoading={isLoading} onClick={handleSubmit((data) => startUpload(data))}>
{totalFilesCount > 0 &&
`${t('common:core.dataset.import.Total files', {
`${t('dataset:total_num_files', {
total: totalFilesCount
})} | `}
{buttonText}

View File

@ -1,102 +0,0 @@
import React, { useState } from 'react';
import { Box, Flex, Grid, IconButton } from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon';
import { useTranslation } from 'next-i18next';
import MyMenu from '@fastgpt/web/components/common/MyMenu';
import { ImportSourceItemType } from '@/web/core/dataset/type';
import dynamic from 'next/dynamic';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
const PreviewRawText = dynamic(() => import('./PreviewRawText'));
const PreviewChunks = dynamic(() => import('./PreviewChunks'));
const Preview = ({ showPreviewChunks }: { showPreviewChunks: boolean }) => {
const { t } = useTranslation();
const { sources } = useContextSelector(DatasetImportContext, (v) => v);
const [previewRawTextSource, setPreviewRawTextSource] = useState<ImportSourceItemType>();
const [previewChunkSource, setPreviewChunkSource] = useState<ImportSourceItemType>();
return (
<Box h={'100%'} w={'100%'} display={['block', 'flex']} flexDirection={'column'}>
<Flex alignItems={'center'}>
<MyIcon name={'core/dataset/fileCollection'} w={'20px'} />
<Box fontSize={'md'}>{t('common:core.dataset.import.Sources list')}</Box>
</Flex>
<Box mt={3} flex={'1 0 0'} h={['auto', 0]} width={'100%'} overflowY={'auto'}>
<Grid w={'100%'} gap={3} gridTemplateColumns={['1fr', '1fr', '1fr', '1fr', '1fr 1fr']}>
{sources.map((source) => (
<Flex
key={source.id}
bg={'white'}
p={4}
borderRadius={'md'}
borderWidth={'1px'}
borderColor={'borderColor.low'}
boxShadow={'2'}
alignItems={'center'}
>
<MyIcon name={source.icon as any} w={['1rem', '1.25rem']} />
<Box mx={1} flex={'1 0 0'} wordBreak={'break-all'} fontSize={'sm'}>
{source.sourceName}
</Box>
{showPreviewChunks && (
<Box fontSize={'xs'} color={'myGray.600'}>
<MyMenu
Button={
<IconButton
icon={<MyIcon name={'common/viewLight'} w={'14px'} p={2} />}
aria-label={''}
size={'sm'}
variant={'whitePrimary'}
/>
}
menuList={[
{
children: [
{
label: (
<Flex alignItems={'center'}>
<MyIcon name={'core/dataset/fileCollection'} w={'14px'} mr={2} />
{t('common:core.dataset.import.Preview raw text')}
</Flex>
),
onClick: () => setPreviewRawTextSource(source)
},
{
label: (
<Flex alignItems={'center'}>
<MyIcon name={'core/dataset/splitLight'} w={'14px'} mr={2} />
{t('common:core.dataset.import.Preview chunks')}
</Flex>
),
onClick: () => setPreviewChunkSource(source)
}
]
}
]}
/>
</Box>
)}
</Flex>
))}
</Grid>
</Box>
{!!previewRawTextSource && (
<PreviewRawText
previewSource={previewRawTextSource}
onClose={() => setPreviewRawTextSource(undefined)}
/>
)}
{!!previewChunkSource && (
<PreviewChunks
previewSource={previewChunkSource}
onClose={() => setPreviewChunkSource(undefined)}
/>
)}
</Box>
);
};
export default React.memo(Preview);

View File

@ -1,78 +0,0 @@
import React from 'react';
import { Box } from '@chakra-ui/react';
import { ImportSourceItemType } from '@/web/core/dataset/type';
import { getPreviewFileContent } from '@/web/common/file/api';
import MyRightDrawer from '@fastgpt/web/components/common/MyDrawer/MyRightDrawer';
import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
import { useToast } from '@fastgpt/web/hooks/useToast';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { getPreviewSourceReadType } from '../utils';
const PreviewRawText = ({
previewSource,
onClose
}: {
previewSource: ImportSourceItemType;
onClose: () => void;
}) => {
const { toast } = useToast();
const { importSource, processParamsForm } = useContextSelector(DatasetImportContext, (v) => v);
const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
const { data, loading: isLoading } = useRequest2(
async () => {
if (importSource === ImportDataSourceEnum.fileCustom && previewSource.rawText) {
return {
previewContent: previewSource.rawText.slice(0, 3000)
};
}
return getPreviewFileContent({
datasetId,
type: getPreviewSourceReadType(previewSource),
sourceId:
previewSource.dbFileId ||
previewSource.link ||
previewSource.externalFileUrl ||
previewSource.apiFileId ||
'',
isQAImport: importSource === ImportDataSourceEnum.csvTable,
selector: processParamsForm.getValues('webSelector'),
externalFileId: previewSource.externalFileId
});
},
{
refreshDeps: [previewSource.dbFileId, previewSource.link, previewSource.externalFileUrl],
manual: false,
onError(err) {
toast({
status: 'warning',
title: getErrText(err)
});
}
}
);
const rawText = data?.previewContent || '';
return (
<MyRightDrawer
onClose={onClose}
iconSrc={previewSource.icon}
title={previewSource.sourceName}
isLoading={isLoading}
px={0}
>
<Box whiteSpace={'pre-wrap'} overflowY={'auto'} px={5} fontSize={'sm'}>
{rawText}
</Box>
</MyRightDrawer>
);
};
export default React.memo(PreviewRawText);

View File

@ -14,24 +14,17 @@ import {
import { ImportSourceItemType } from '@/web/core/dataset/type.d';
import MyIcon from '@fastgpt/web/components/common/Icon';
import { useTranslation } from 'next-i18next';
import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
import dynamic from 'next/dynamic';
import { useI18n } from '@/web/context/I18n';
const PreviewRawText = dynamic(() => import('./PreviewRawText'));
export const RenderUploadFiles = ({
files,
setFiles,
showPreviewContent
setFiles
}: {
files: ImportSourceItemType[];
setFiles: React.Dispatch<React.SetStateAction<ImportSourceItemType[]>>;
showPreviewContent?: boolean;
}) => {
const { t } = useTranslation();
const { fileT } = useI18n();
const [previewFile, setPreviewFile] = useState<ImportSourceItemType>();
return files.length > 0 ? (
<>
@ -84,18 +77,6 @@ export const RenderUploadFiles = ({
<Td>
{!item.isUploading && (
<Flex alignItems={'center'} gap={4}>
{showPreviewContent && (
<MyTooltip label={t('common:core.dataset.import.Preview raw text')}>
<IconButton
variant={'whitePrimary'}
size={'sm'}
icon={<MyIcon name={'common/viewLight'} w={'18px'} />}
aria-label={''}
onClick={() => setPreviewFile(item)}
/>
</MyTooltip>
)}
<IconButton
variant={'grayDanger'}
size={'sm'}
@ -113,9 +94,6 @@ export const RenderUploadFiles = ({
</Tbody>
</Table>
</TableContainer>
{!!previewFile && (
<PreviewRawText previewSource={previewFile} onClose={() => setPreviewFile(undefined)} />
)}
</>
) : null;
};

View File

@ -28,7 +28,7 @@ const APIDatasetCollection = () => {
return (
<>
{activeStep === 0 && <CustomAPIFileInput />}
{activeStep === 1 && <DataProcess showPreviewChunks={true} />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <Upload />}
</>
);
@ -272,7 +272,7 @@ const CustomAPIFileInput = () => {
onClick={onclickNext}
>
{selectFiles.length > 0
? `${t('common:core.dataset.import.Total files', { total: selectFiles.length })} | `
? `${t('dataset:total_num_files', { total: selectFiles.length })} | `
: ''}
{t('common:common.Next Step')}
</Button>

View File

@ -34,7 +34,7 @@ const ExternalFileCollection = () => {
return (
<>
{activeStep === 0 && <CustomLinkInput />}
{activeStep === 1 && <DataProcess showPreviewChunks={true} />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <Upload />}
</>
);

View File

@ -19,7 +19,7 @@ const CustomTet = () => {
return (
<>
{activeStep === 0 && <CustomTextInput />}
{activeStep === 1 && <DataProcess showPreviewChunks />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <Upload />}
</>
);

View File

@ -23,7 +23,7 @@ const LinkCollection = () => {
return (
<>
{activeStep === 0 && <CustomLinkImport />}
{activeStep === 1 && <DataProcess showPreviewChunks />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <Upload />}
</>
);

View File

@ -10,9 +10,8 @@ import { RenderUploadFiles } from '../components/RenderFiles';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
const DataProcess = dynamic(() => import('../commonProgress/DataProcess'), {
loading: () => <Loading fixed={false} />
});
const DataProcess = dynamic(() => import('../commonProgress/DataProcess'));
const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
const Upload = dynamic(() => import('../commonProgress/Upload'));
const fileType = '.txt, .docx, .csv, .xlsx, .pdf, .md, .html, .pptx';
@ -23,8 +22,9 @@ const FileLocal = () => {
return (
<>
{activeStep === 0 && <SelectFile />}
{activeStep === 1 && <DataProcess showPreviewChunks />}
{activeStep === 2 && <Upload />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <PreviewData />}
{activeStep === 3 && <Upload />}
</>
);
};
@ -64,12 +64,12 @@ const SelectFile = React.memo(function SelectFile() {
/>
{/* render files */}
<RenderUploadFiles files={selectFiles} setFiles={setSelectFiles} showPreviewContent />
<RenderUploadFiles files={selectFiles} setFiles={setSelectFiles} />
<Box textAlign={'right'} mt={5}>
<Button isDisabled={successFiles.length === 0 || uploading} onClick={onclickNext}>
{selectFiles.length > 0
? `${t('core.dataset.import.Total files', { total: selectFiles.length })} | `
? `${t('dataset:total_num_files', { total: selectFiles.length })} | `
: ''}
{t('common:common.Next Step')}
</Button>

View File

@ -8,10 +8,13 @@ import { useRouter } from 'next/router';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { getDatasetCollectionById } from '@/web/core/dataset/api';
import MyBox from '@fastgpt/web/components/common/MyBox';
import { ImportProcessWayEnum } from '@/web/core/dataset/constants';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { Box } from '@chakra-ui/react';
const Upload = dynamic(() => import('../commonProgress/Upload'));
const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
const ReTraining = () => {
const router = useRouter();
@ -20,6 +23,7 @@ const ReTraining = () => {
collectionId: string;
};
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const activeStep = useContextSelector(DatasetImportContext, (v) => v.activeStep);
const setSources = useContextSelector(DatasetImportContext, (v) => v.setSources);
const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
@ -43,8 +47,12 @@ const ReTraining = () => {
}
]);
processParamsForm.reset({
mode: collection.trainingType,
way: ImportProcessWayEnum.auto,
customPdfParse: collection.customPdfParse,
trainingType: collection.trainingType,
imageIndex: collection.imageIndex,
autoIndexes: collection.autoIndexes,
chunkSettingMode: ChunkSettingModeEnum.auto,
embeddingChunkSize: collection.chunkSize,
qaChunkSize: collection.chunkSize,
customSplitChar: collection.chunkSplitter,
@ -55,9 +63,12 @@ const ReTraining = () => {
});
return (
<MyBox isLoading={loading} h={'100%'} overflow={'auto'}>
{activeStep === 0 && <DataProcess showPreviewChunks={true} />}
{activeStep === 1 && <Upload />}
<MyBox isLoading={loading} h={'100%'}>
<Box h={'100%'} overflow={'auto'}>
{activeStep === 0 && <DataProcess />}
{activeStep === 1 && <PreviewData />}
{activeStep === 2 && <Upload />}
</Box>
</MyBox>
);
};

View File

@ -21,7 +21,7 @@ const FileLocal = () => {
return (
<>
{activeStep === 0 && <SelectFile />}
{activeStep === 1 && <PreviewData showPreviewChunks />}
{activeStep === 1 && <PreviewData />}
{activeStep === 2 && <Upload />}
</>
);
@ -91,7 +91,7 @@ const SelectFile = React.memo(function SelectFile() {
}}
>
{selectFiles.length > 0
? `${t('core.dataset.import.Total files', { total: selectFiles.length })} | `
? `${t('dataset:total_num_files', { total: selectFiles.length })} | `
: ''}
{t('common:common.Next Step')}
</Button>

View File

@ -1,4 +1,4 @@
import React, { useEffect, useState } from 'react';
import React, { useEffect, useMemo, useState } from 'react';
import { Box, Flex, Switch, Input } from '@chakra-ui/react';
import { useConfirm } from '@fastgpt/web/hooks/useConfirm';
import { useForm } from 'react-hook-form';
@ -37,6 +37,8 @@ const Info = ({ datasetId }: { datasetId: string }) => {
const { t } = useTranslation();
const { datasetDetail, loadDatasetDetail, updateDataset, rebuildingCount, trainingCount } =
useContextSelector(DatasetPageContext, (v) => v);
const { feConfigs, datasetModelList, embeddingModelList, getVllmModelList } = useSystemStore();
const [editedDataset, setEditedDataset] = useState<EditResourceInfoFormType>();
const [editedAPIDataset, setEditedAPIDataset] = useState<EditAPIDatasetInfoFormType>();
const refetchDatasetTraining = useContextSelector(
@ -50,7 +52,9 @@ const Info = ({ datasetId }: { datasetId: string }) => {
const vectorModel = watch('vectorModel');
const agentModel = watch('agentModel');
const { feConfigs, datasetModelList, embeddingModelList } = useSystemStore();
const vllmModelList = useMemo(() => getVllmModelList(), [getVllmModelList]);
const vlmModel = watch('vlmModel');
const { ConfirmModal: ConfirmDelModal } = useConfirm({
content: t('common:core.dataset.Delete Confirm'),
type: 'delete'
@ -69,7 +73,8 @@ const Info = ({ datasetId }: { datasetId: string }) => {
(data: DatasetItemType) => {
return updateDataset({
id: datasetId,
agentModel: data.agentModel,
agentModel: data.agentModel?.model,
vlmModel: data.vlmModel?.model,
externalReadUrl: data.externalReadUrl
});
},
@ -225,6 +230,31 @@ const Info = ({ datasetId }: { datasetId: string }) => {
</Box>
</Box>
{feConfigs?.isPlus && (
<Box pt={5}>
<FormLabel fontSize={'mini'} fontWeight={'500'}>
{t('dataset:vllm_model')}
</FormLabel>
<Box pt={2}>
<AIModelSelector
w={'100%'}
value={vlmModel?.model}
list={vllmModelList.map((item) => ({
label: item.name,
value: item.model
}))}
fontSize={'mini'}
onchange={(e) => {
const vlmModel = vllmModelList.find((item) => item.model === e);
if (!vlmModel) return;
setValue('vlmModel', vlmModel);
return handleSubmit((data) => onSave({ ...data, vlmModel }))();
}}
/>
</Box>
</Box>
)}
{feConfigs?.isPlus && (
<Flex alignItems={'center'} pt={5}>
<FormLabel fontSize={'mini'} fontWeight={'500'}>

View File

@ -1,9 +1,7 @@
import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react';
import { Box, Flex, Button, Textarea, useTheme, Grid, HStack } from '@chakra-ui/react';
import { Box, Flex, Button, Textarea, useTheme } from '@chakra-ui/react';
import {
Control,
FieldArrayWithId,
UseFieldArrayAppend,
UseFieldArrayRemove,
UseFormRegister,
useFieldArray,
@ -12,7 +10,6 @@ import {
import {
postInsertData2Dataset,
putDatasetDataById,
delOneDatasetDataById,
getDatasetCollectionById,
getDatasetDataItemById
} from '@/web/core/dataset/api';
@ -24,7 +21,7 @@ import { useQuery } from '@tanstack/react-query';
import { useTranslation } from 'next-i18next';
import { useRequest, useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { useConfirm } from '@fastgpt/web/hooks/useConfirm';
import { getDefaultIndex, getSourceNameIcon } from '@fastgpt/global/core/dataset/utils';
import { getSourceNameIcon } from '@fastgpt/global/core/dataset/utils';
import { DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type';
import DeleteIcon from '@fastgpt/web/components/common/Icon/delete';
import { defaultCollectionDetail } from '@/web/core/dataset/constants';
@ -36,6 +33,7 @@ import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
import { useSystem } from '@fastgpt/web/hooks/useSystem';
import LightRowTabs from '@fastgpt/web/components/common/Tabs/LightRowTabs';
import styles from './styles.module.scss';
import { getDatasetIndexMapData } from '@fastgpt/global/core/dataset/data/constants';
export type InputDataType = {
q: string;
@ -218,10 +216,7 @@ const InputDataModal = ({
await putDatasetDataById({
dataId,
...e,
indexes:
e.indexes?.map((index) =>
index.defaultIndex ? getDefaultIndex({ q: e.q, a: e.a, dataId: index.dataId }) : index
) || []
indexes: e.indexes
});
return {
@ -296,7 +291,7 @@ const InputDataModal = ({
p={0}
onClick={() =>
appendIndexes({
defaultIndex: false,
type: 'custom',
text: '',
dataId: `${Date.now()}`
})
@ -315,7 +310,6 @@ const InputDataModal = ({
<DataIndex
register={register}
maxToken={maxToken}
appendIndexes={appendIndexes}
removeIndexes={removeIndexes}
indexes={indexes}
/>
@ -424,13 +418,11 @@ const DataIndex = ({
maxToken,
register,
indexes,
appendIndexes,
removeIndexes
}: {
maxToken: number;
register: UseFormRegister<InputDataType>;
indexes: FieldArrayWithId<InputDataType, 'indexes', 'id'>[];
appendIndexes: UseFieldArrayAppend<InputDataType, 'indexes'>;
removeIndexes: UseFieldArrayRemove;
}) => {
const { t } = useTranslation();
@ -438,52 +430,41 @@ const DataIndex = ({
return (
<>
<Flex mt={3} gap={3} flexDir={'column'}>
<Box
p={4}
borderRadius={'md'}
border={'1.5px solid var(--light-fastgpt-primary-opacity-01, rgba(51, 112, 255, 0.10))'}
bg={'primary.50'}
>
<Flex mb={2}>
<Box flex={1} fontWeight={'medium'} fontSize={'sm'} color={'primary.700'}>
{t('common:dataset.data.Default Index')}
</Box>
</Flex>
<Box fontSize={'sm'} fontWeight={'medium'} color={'myGray.600'}>
{t('common:core.dataset.data.Default Index Tip')}
</Box>
</Box>
{indexes?.map((index, i) => {
const data = getDatasetIndexMapData(index.type);
return (
!index.defaultIndex && (
<Box
key={index.dataId || i}
p={4}
borderRadius={'md'}
border={'1.5px solid var(--Gray-Modern-200, #E8EBF0)'}
bg={'myGray.25'}
_hover={{
'& .delete': {
display: 'block'
}
}}
>
<Flex mb={2}>
<Box flex={1} fontWeight={'medium'} fontSize={'sm'} color={'myGray.900'}>
{t('dataset.data.Custom Index Number', { number: i })}
</Box>
<Box
key={index.dataId || i}
p={4}
borderRadius={'md'}
border={'1.5px solid var(--Gray-Modern-200, #E8EBF0)'}
bg={'myGray.25'}
_hover={{
'& .delete': {
display: 'block'
}
}}
>
<Flex mb={2}>
<Box flex={1} fontWeight={'medium'} fontSize={'sm'} color={'myGray.900'}>
{t(data.label)}
</Box>
{index.type !== 'default' && (
<DeleteIcon
onClick={() => {
if (indexes.length <= 1) {
appendIndexes(getDefaultIndex({ dataId: `${Date.now()}` }));
}
removeIndexes(i);
}}
/>
</Flex>
<DataIndexTextArea index={i} maxToken={maxToken} register={register} />
</Box>
)
)}
</Flex>
<DataIndexTextArea
disabled={index.type === 'default'}
index={i}
value={index.text}
maxToken={maxToken}
register={register}
/>
</Box>
);
})}
</Flex>
@ -491,14 +472,19 @@ const DataIndex = ({
);
};
const textareaMinH = '40px';
const DataIndexTextArea = ({
value,
index,
maxToken,
register
register,
disabled
}: {
value: string;
index: number;
maxToken: number;
register: UseFormRegister<InputDataType>;
disabled?: boolean;
}) => {
const { t } = useTranslation();
const TextareaDom = useRef<HTMLTextAreaElement | null>(null);
@ -509,7 +495,7 @@ const DataIndexTextArea = ({
onChange: onTextChange,
onBlur
} = register(`indexes.${index}.text`, { required: true });
const textareaMinH = '40px';
useEffect(() => {
if (TextareaDom.current) {
TextareaDom.current.style.height = textareaMinH;
@ -522,7 +508,12 @@ const DataIndexTextArea = ({
e.target.style.height = `${e.target.scrollHeight + 5}px`;
}
}, []);
return (
return disabled ? (
<Box fontSize={'sm'} color={'myGray.500'} whiteSpace={'pre-wrap'}>
{value}
</Box>
) : (
<Textarea
maxLength={maxToken}
borderColor={'transparent'}

View File

@ -7,7 +7,10 @@ import { useRouter } from 'next/router';
import MyBox from '@fastgpt/web/components/common/MyBox';
import { formatFileSize } from '@fastgpt/global/common/file/tools';
import { formatTime2YMDHM } from '@fastgpt/global/common/string/time';
import { DatasetCollectionTypeMap, TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
import {
DatasetCollectionDataProcessModeMap,
DatasetCollectionTypeMap
} from '@fastgpt/global/core/dataset/constants';
import { getCollectionSourceAndOpen } from '@/web/core/dataset/hooks/readCollectionSource';
import MyIcon from '@fastgpt/web/components/common/Icon';
@ -61,13 +64,25 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
label: t('common:core.dataset.collection.metadata.Updatetime'),
value: formatTime2YMDHM(collection.updateTime)
},
{
label: t('dataset:collection_metadata_custom_pdf_parse'),
value: collection.customPdfParse ? 'Yes' : 'No'
},
{
label: t('common:core.dataset.collection.metadata.Raw text length'),
value: collection.rawTextLength ?? '-'
},
{
label: t('dataset:collection.Training type'),
value: t(TrainingTypeMap[collection.trainingType]?.label as any)
label: t('dataset:collection_metadata_image_parse'),
value: collection.imageIndex ? 'Yes' : 'No'
},
{
label: t('dataset:auto_indexes'),
value: collection.autoIndexes ? 'Yes' : 'No'
},
{
label: t('dataset:collection.training_type'),
value: t(DatasetCollectionDataProcessModeMap[collection.trainingType]?.label as any)
},
{
label: t('common:core.dataset.collection.metadata.Chunk Size'),
@ -99,8 +114,8 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
<Box fontSize={'md'} pb={4}>
{t('common:core.dataset.collection.metadata.metadata')}
</Box>
<Flex mb={4} wordBreak={'break-all'} fontSize={'sm'}>
<Box color={'myGray.500'} flex={'0 0 70px'}>
<Flex mb={3} wordBreak={'break-all'} fontSize={'sm'}>
<Box color={'myGray.500'} flex={'0 0 90px'}>
{t('common:core.dataset.collection.id')}:
</Box>
<Box>{collection?._id}</Box>
@ -109,8 +124,8 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
(item, i) =>
item.label &&
item.value && (
<Flex key={i} alignItems={'center'} mb={4} wordBreak={'break-all'} fontSize={'sm'}>
<Box color={'myGray.500'} flex={'0 0 70px'}>
<Flex key={i} alignItems={'center'} mb={3} wordBreak={'break-all'} fontSize={'sm'}>
<Box color={'myGray.500'} flex={'0 0 90px'}>
{item.label}
</Box>
<Box>{item.value}</Box>

View File

@ -2,7 +2,6 @@ import React, { useMemo } from 'react';
import { Box, Flex, Button, ModalFooter, ModalBody, Input, HStack } from '@chakra-ui/react';
import { useSelectFile } from '@/web/common/file/hooks/useSelectFile';
import { useForm } from 'react-hook-form';
import { useToast } from '@fastgpt/web/hooks/useToast';
import { useRouter } from 'next/router';
import { useSystemStore } from '@/web/common/system/useSystemStore';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
@ -41,7 +40,8 @@ const CreateModal = ({
}) => {
const { t } = useTranslation();
const router = useRouter();
const { defaultModels, embeddingModelList, datasetModelList } = useSystemStore();
const { feConfigs, defaultModels, embeddingModelList, datasetModelList, getVllmModelList } =
useSystemStore();
const { isPc } = useSystem();
const datasetTypeMap = useMemo(() => {
@ -71,6 +71,8 @@ const CreateModal = ({
const filterNotHiddenVectorModelList = embeddingModelList.filter((item) => !item.hidden);
const vllmModelList = useMemo(() => getVllmModelList(), [getVllmModelList]);
const form = useForm<CreateDatasetParams>({
defaultValues: {
parentId,
@ -81,13 +83,15 @@ const CreateModal = ({
vectorModel:
defaultModels.embedding?.model || getWebDefaultEmbeddingModel(embeddingModelList)?.model,
agentModel:
defaultModels.datasetTextLLM?.model || getWebDefaultLLMModel(datasetModelList)?.model
defaultModels.datasetTextLLM?.model || getWebDefaultLLMModel(datasetModelList)?.model,
vlmModel: defaultModels.datasetImageLLM?.model
}
});
const { register, setValue, handleSubmit, watch } = form;
const avatar = watch('avatar');
const vectorModel = watch('vectorModel');
const agentModel = watch('agentModel');
const vlmModel = watch('vlmModel');
const {
File,
@ -174,6 +178,7 @@ const CreateModal = ({
/>
</Flex>
</Box>
<Flex
mt={6}
alignItems={['flex-start', 'center']}
@ -206,6 +211,7 @@ const CreateModal = ({
/>
</Box>
</Flex>
<Flex
mt={6}
alignItems={['flex-start', 'center']}
@ -232,11 +238,45 @@ const CreateModal = ({
value: item.model
}))}
onchange={(e) => {
setValue('agentModel' as const, e);
setValue('agentModel', e);
}}
/>
</Box>
</Flex>
{feConfigs?.isPlus && (
<Flex
mt={6}
alignItems={['flex-start', 'center']}
justify={'space-between'}
flexDir={['column', 'row']}
>
<HStack
spacing={1}
flex={['', '0 0 110px']}
fontSize={'sm'}
color={'myGray.900'}
fontWeight={500}
pb={['12px', '0']}
>
<Box>{t('dataset:vllm_model')}</Box>
</HStack>
<Box w={['100%', '300px']}>
<AIModelSelector
w={['100%', '300px']}
value={vlmModel}
list={vllmModelList.map((item) => ({
label: item.name,
value: item.model
}))}
onchange={(e) => {
setValue('vlmModel', e);
}}
/>
</Box>
</Flex>
)}
{/* @ts-ignore */}
<ApiDatasetForm type={type} form={form} />
</ModalBody>

View File

@ -0,0 +1,65 @@
import { NextAPI } from '@/service/middleware/entry';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { NextApiRequest, NextApiResponse } from 'next';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
// Convert all collections with trainingType=auto to trainingType=chunk and enable autoIndexes
const updateCollections = async () => {
await MongoDatasetCollection.updateMany(
{
trainingType: DatasetCollectionDataProcessModeEnum.auto
},
{
$set: {
trainingType: DatasetCollectionDataProcessModeEnum.chunk,
autoIndexes: true
}
}
);
};
const updateData = async () => {
await MongoDatasetData.updateMany({ indexes: { $exists: true } }, [
{
$set: {
indexes: {
$map: {
input: '$indexes',
as: 'index',
in: {
$mergeObjects: [
'$$index',
{
type: {
$cond: {
if: { $eq: ['$$index.defaultIndex', true] },
then: DatasetDataIndexTypeEnum.default,
else: DatasetDataIndexTypeEnum.custom
}
}
}
]
}
}
}
}
}
]);
};
async function handler(req: NextApiRequest, _res: NextApiResponse) {
await authCert({ req, authRoot: true });
console.log('Set trainingType to chunk for all collections');
await updateCollections();
console.log(
"Update indexes on all data: indexes with defaultIndex=true get type='default', the rest get type='custom'"
);
await updateData();
return { success: true };
}
export default NextAPI(handler);
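For reference, a minimal sketch of what the aggregation pipeline above does to one data document. The shapes below are illustrative assumptions, not data from the commit:

```ts
// Hypothetical document, before updateData()
const before = {
  indexes: [
    { defaultIndex: true, dataId: '1', text: 'chunk text' },
    { defaultIndex: false, dataId: '2', text: 'custom index text' }
  ]
};

// After updateData(): $mergeObjects keeps the existing fields and adds `type`
const after = {
  indexes: [
    { defaultIndex: true, dataId: '1', text: 'chunk text', type: 'default' },
    { defaultIndex: false, dataId: '2', text: 'custom index text', type: 'custom' }
  ]
};
```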

View File

@ -1,78 +0,0 @@
/*
Read db file content and response 3000 words
*/
import type { NextApiResponse } from 'next';
import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
import { NextAPI } from '@/service/middleware/entry';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
import { ApiRequestProps } from '@fastgpt/service/type/next';
import {
OwnerPermissionVal,
WritePermissionVal
} from '@fastgpt/global/support/permission/constant';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
export type PreviewContextProps = {
datasetId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
isQAImport?: boolean;
selector?: string;
externalFileId?: string;
};
async function handler(req: ApiRequestProps<PreviewContextProps>, res: NextApiResponse<any>) {
const { type, sourceId, isQAImport, selector, datasetId, externalFileId } = req.body;
if (!sourceId) {
throw new Error('fileId is empty');
}
const { teamId, apiServer, feishuServer, yuqueServer } = await (async () => {
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const res = await authCollectionFile({
req,
authToken: true,
authApiKey: true,
fileId: sourceId,
per: OwnerPermissionVal
});
return {
teamId: res.teamId
};
}
const { dataset } = await authDataset({
req,
authApiKey: true,
authToken: true,
datasetId,
per: WritePermissionVal
});
return {
teamId: dataset.teamId,
apiServer: dataset.apiServer,
feishuServer: dataset.feishuServer,
yuqueServer: dataset.yuqueServer
};
})();
const rawText = await readDatasetSourceRawText({
teamId,
type,
sourceId,
isQAImport,
selector,
apiServer,
feishuServer,
yuqueServer,
externalFileId
});
return {
previewContent: rawText.slice(0, 3000),
totalLength: rawText.length
};
}
export default NextAPI(handler);

View File

@ -4,7 +4,8 @@ import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
TrainingModeEnum,
DatasetCollectionTypeEnum
DatasetCollectionTypeEnum,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { NextAPI } from '@/service/middleware/entry';
@ -15,15 +16,7 @@ import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
async function handler(req: NextApiRequest): CreateCollectionResponse {
const {
name,
apiFileId,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
...body
} = req.body as ApiDatasetCreateDatasetCollectionParams;
const { name, apiFileId, ...body } = req.body as ApiDatasetCreateDatasetCollectionParams;
const { teamId, tmbId, dataset } = await authDataset({
req,
@ -56,7 +49,8 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
feishuServer,
yuqueServer,
apiFileId,
teamId
teamId,
tmbId
});
const { collectionId, insertResults } = await createCollectionAndInsertData({
@ -69,10 +63,6 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
tmbId,
type: DatasetCollectionTypeEnum.apiFile,
name: name,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
apiFileId,
metadata: {
relatedImgId: apiFileId

View File

@ -4,6 +4,7 @@ import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
@ -15,7 +16,6 @@ import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schem
async function handler(req: NextApiRequest): CreateCollectionResponse {
const { datasetId, parentId, fileId, ...body } = req.body as FileIdCreateDatasetCollectionParams;
const trainingType = TrainingModeEnum.chunk;
const { teamId, tmbId, dataset } = await authDataset({
req,
authToken: true,
@ -27,6 +27,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
// 1. read file
const { rawText, filename } = await readFileContentFromMongo({
teamId,
tmbId,
bucketName: BucketNameEnum.dataset,
fileId,
isQAImport: true
@ -47,7 +48,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
fileId,
// special metadata
trainingType,
trainingType: DatasetCollectionDataProcessModeEnum.chunk,
chunkSize: 0
}
});

View File

@ -2,12 +2,8 @@ import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/co
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
DatasetCollectionTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schema';
import { NextAPI } from '@/service/middleware/entry';
import { ApiRequestProps } from '@fastgpt/service/type/next';
@ -17,14 +13,7 @@ import { CreateCollectionResponse } from '@/global/core/dataset/api';
async function handler(
req: ApiRequestProps<FileIdCreateDatasetCollectionParams>
): CreateCollectionResponse {
const {
fileId,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
...body
} = req.body;
const { fileId, customPdfParse, ...body } = req.body;
const { teamId, tmbId, dataset } = await authDataset({
req,
@ -37,8 +26,10 @@ async function handler(
// 1. read file
const { rawText, filename } = await readFileContentFromMongo({
teamId,
tmbId,
bucketName: BucketNameEnum.dataset,
fileId
fileId,
customPdfParse
});
const { collectionId, insertResults } = await createCollectionAndInsertData({
@ -54,12 +45,7 @@ async function handler(
metadata: {
relatedImgId: fileId
},
// special metadata
trainingType,
chunkSize,
chunkSplitter,
qaPrompt
customPdfParse
},
relatedId: fileId

View File

@ -13,14 +13,7 @@ import { urlsFetch } from '@fastgpt/service/common/string/cheerio';
import { hashStr } from '@fastgpt/global/common/string/tools';
async function handler(req: NextApiRequest): CreateCollectionResponse {
const {
link,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
...body
} = req.body as LinkCreateDatasetCollectionParams;
const { link, ...body } = req.body as LinkCreateDatasetCollectionParams;
const { teamId, tmbId, dataset } = await authDataset({
req,
@ -53,12 +46,6 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
relatedImgId: link,
webPageSelector: body?.metadata?.webPageSelector
},
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
rawLink: link
},

View File

@ -6,7 +6,7 @@ import { FileCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/
import { removeFilesByPaths } from '@fastgpt/service/common/file/utils';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { getNanoid, hashStr } from '@fastgpt/global/common/string/tools';
import { getNanoid } from '@fastgpt/global/common/string/tools';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { readRawTextByLocalFile } from '@fastgpt/service/common/file/read/utils';
import { NextAPI } from '@/service/middleware/entry';
@ -48,8 +48,10 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>): CreateCo
// 1. read file
const { rawText } = await readRawTextByLocalFile({
teamId,
tmbId,
path: file.path,
encoding: file.encoding,
customPdfParse: collectionData.customPdfParse,
metadata: {
...fileMetadata,
relatedId: relatedImgId

View File

@ -24,20 +24,14 @@ type RetrainingCollectionResponse = {
async function handler(
req: ApiRequestProps<reTrainingDatasetFileCollectionParams>
): Promise<RetrainingCollectionResponse> {
const {
collectionId,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt
} = req.body;
const { collectionId, customPdfParse, ...data } = req.body;
if (!collectionId) {
return Promise.reject(CommonErrEnum.missingParams);
}
// Credential check
const { collection } = await authDatasetCollection({
const { collection, teamId, tmbId } = await authDatasetCollection({
req,
authToken: true,
authApiKey: true,
@ -84,7 +78,9 @@ async function handler(
})();
const rawText = await readDatasetSourceRawText({
teamId: collection.teamId,
teamId,
tmbId,
customPdfParse,
...sourceReadType
});
@ -100,12 +96,15 @@ async function handler(
dataset: collection.dataset,
rawText,
createCollectionParams: {
...data,
teamId: collection.teamId,
tmbId: collection.tmbId,
datasetId: collection.dataset._id,
name: collection.name,
type: collection.type,
customPdfParse,
fileId: collection.fileId,
rawLink: collection.rawLink,
externalFileId: collection.externalFileId,
@ -121,10 +120,6 @@ async function handler(
parentId: collection.parentId,
// special metadata
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
metadata: collection.metadata
}
});

View File

@ -2,25 +2,13 @@ import type { NextApiRequest } from 'next';
import type { TextCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
TrainingModeEnum,
DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { NextAPI } from '@/service/middleware/entry';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
import { CreateCollectionResponse } from '@/global/core/dataset/api';
async function handler(req: NextApiRequest): CreateCollectionResponse {
const {
name,
text,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
...body
} = req.body as TextCreateDatasetCollectionParams;
const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams;
const { teamId, tmbId, dataset } = await authDataset({
req,
@ -39,11 +27,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
tmbId,
type: DatasetCollectionTypeEnum.virtual,
name,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt
name
}
});

View File

@ -6,12 +6,12 @@ import {
getLLMModel,
getEmbeddingModel,
getDatasetModel,
getDefaultEmbeddingModel
getDefaultEmbeddingModel,
getVlmModel
} from '@fastgpt/service/core/ai/model';
import { checkTeamDatasetLimit } from '@fastgpt/service/support/permission/teamLimit';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
import { NextAPI } from '@/service/middleware/entry';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import type { ApiRequestProps } from '@fastgpt/service/type/next';
import { parseParentIdInMongo } from '@fastgpt/global/common/parentFolder/utils';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
@ -32,8 +32,9 @@ async function handler(
intro,
type = DatasetTypeEnum.dataset,
avatar,
vectorModel = getDefaultEmbeddingModel().model,
agentModel = getDatasetModel().model,
vectorModel = getDefaultEmbeddingModel()?.model,
agentModel = getDatasetModel()?.model,
vlmModel,
apiServer,
feishuServer,
yuqueServer
@ -63,8 +64,11 @@ async function handler(
// check model valid
const vectorModelStore = getEmbeddingModel(vectorModel);
const agentModelStore = getLLMModel(agentModel);
if (!vectorModelStore || !agentModelStore) {
return Promise.reject(DatasetErrEnum.invalidVectorModelOrQAModel);
if (!vectorModelStore) {
return Promise.reject(`System not embedding model`);
}
if (!agentModelStore) {
return Promise.reject(`System not llm model`);
}
// check limit
@ -81,6 +85,7 @@ async function handler(
tmbId,
vectorModel,
agentModel,
vlmModel,
avatar,
type,
apiServer,

View File

@ -7,9 +7,13 @@ import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';
import { NextAPI } from '@/service/middleware/entry';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
import { getTrainingModeByCollection } from '@fastgpt/service/core/dataset/collection/utils';
async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
const body = req.body as PushDatasetDataProps;
// Adapter 4.9.0
body.trainingType = body.trainingType || body.trainingMode;
const { collectionId, data } = body;
if (!collectionId || !Array.isArray(data)) {
@ -32,7 +36,7 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
// auth dataset limit
await checkDatasetLimit({
teamId,
insertLen: predictDataLimitLength(collection.trainingType, data)
insertLen: predictDataLimitLength(getTrainingModeByCollection(collection), data)
});
return pushDataListToTrainingQueue({
@ -40,8 +44,9 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
teamId,
tmbId,
datasetId: collection.datasetId,
vectorModel: collection.dataset.vectorModel,
agentModel: collection.dataset.agentModel,
vectorModel: collection.dataset.vectorModel
vlmModel: collection.dataset.vlmModel
});
}
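
To make the 4.9.0 adapter in the handler above concrete: clients that still send the legacy `trainingMode` field keep working because it is copied onto `trainingType` before anything else runs. A minimal sketch of that fallback — the request body below is invented, only the two field names come from the diff:

```ts
// Hypothetical body from a pre-4.9.0 client that still uses `trainingMode`.
const legacyBody: Record<string, any> = {
  collectionId: '<collectionId>',
  trainingMode: 'chunk',
  data: [{ q: 'FastGPT 支持图片索引吗?', a: '4.9.0 起支持。' }]
};

// Same fallback the handler applies: prefer the new field, fall back to the legacy one.
legacyBody.trainingType = legacyBody.trainingType || legacyBody.trainingMode;

console.log(legacyBody.trainingType); // 'chunk'
```

New integrations should send `trainingType` directly.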

View File

@ -1,4 +1,4 @@
import { getLLMModel, getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '@fastgpt/service/core/ai/model';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
import { NextAPI } from '@/service/middleware/entry';
@ -51,7 +51,8 @@ async function handler(req: ApiRequestProps<Query>): Promise<DatasetItemType> {
: undefined,
permission,
vectorModel: getEmbeddingModel(dataset.vectorModel),
agentModel: getLLMModel(dataset.agentModel)
agentModel: getLLMModel(dataset.agentModel),
vlmModel: getVlmModel(dataset.vlmModel)
};
}

View File

@ -17,6 +17,7 @@ export type PostPreviewFilesChunksProps = {
chunkSize: number;
overlapRatio: number;
customSplitChar?: string;
customPdfParse?: boolean;
// Read params
selector?: string;
@ -40,7 +41,8 @@ async function handler(
selector,
isQAImport,
datasetId,
externalFileId
externalFileId,
customPdfParse = false
} = req.body;
if (!sourceId) {
@ -50,7 +52,7 @@ async function handler(
throw new Error('chunkSize is too large, should be less than 30000');
}
const { teamId, apiServer, feishuServer, yuqueServer } = await (async () => {
const { teamId, tmbId, apiServer, feishuServer, yuqueServer } = await (async () => {
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const res = await authCollectionFile({
req,
@ -60,10 +62,11 @@ async function handler(
per: OwnerPermissionVal
});
return {
teamId: res.teamId
teamId: res.teamId,
tmbId: res.tmbId
};
}
const { dataset } = await authDataset({
const { dataset, teamId, tmbId } = await authDataset({
req,
authApiKey: true,
authToken: true,
@ -71,7 +74,8 @@ async function handler(
per: WritePermissionVal
});
return {
teamId: dataset.teamId,
teamId,
tmbId,
apiServer: dataset.apiServer,
feishuServer: dataset.feishuServer,
yuqueServer: dataset.yuqueServer
@ -80,6 +84,7 @@ async function handler(
const rawText = await readDatasetSourceRawText({
teamId,
tmbId,
type,
sourceId,
selector,
@ -87,7 +92,8 @@ async function handler(
apiServer,
feishuServer,
yuqueServer,
externalFileId
externalFileId,
customPdfParse
});
return rawText2Chunks({
@ -96,6 +102,6 @@ async function handler(
overlapRatio,
customReg: customSplitChar ? [customSplitChar] : [],
isQAImport: isQAImport
}).slice(0, 15);
}).slice(0, 10);
}
export default NextAPI(handler);
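
For orientation, the preview handler above now accepts a `customPdfParse` flag next to the existing chunking options. A hedged sketch of a request body, using only fields visible in this diff; the values are placeholders and the string assumed for `type` may not match the real enum value:

```ts
// Illustrative body for the chunk-preview request; all values are placeholders.
const previewChunksBody = {
  type: 'fileLocal', // assumed string value of DatasetSourceReadTypeEnum.fileLocal
  sourceId: '<fileId of an uploaded file>',
  chunkSize: 512, // must stay below the 30000 limit enforced above
  overlapRatio: 0.2,
  customPdfParse: true, // new in this commit: run PDFs through the enhanced parser (e.g. Doc2x)
  isQAImport: false
};
```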

View File

@ -6,7 +6,7 @@ import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '@fastgpt/service/core/ai/model';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ApiRequestProps } from '@fastgpt/service/type/next';
import { OwnerPermissionVal } from '@fastgpt/global/support/permission/constant';
@ -50,7 +50,8 @@ async function handler(req: ApiRequestProps<rebuildEmbeddingBody>): Promise<Resp
appName: '切换索引模型',
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name
agentModel: getLLMModel(dataset.agentModel)?.name,
vllmModel: getVlmModel(dataset.vlmModel)?.name
});
// update vector model and dataset.data rebuild field

View File

@ -56,6 +56,7 @@ async function handler(
avatar,
intro,
agentModel,
vlmModel,
websiteConfig,
externalReadUrl,
apiServer,
@ -109,7 +110,7 @@ async function handler(
updateTraining({
teamId: dataset.teamId,
datasetId: id,
agentModel: agentModel?.model
agentModel
});
const onUpdate = async (session: ClientSession) => {
@ -119,7 +120,8 @@ async function handler(
...parseParentIdInMongo(parentId),
...(name && { name }),
...(avatar && { avatar }),
...(agentModel && { agentModel: agentModel.model }),
...(agentModel && { agentModel }),
...(vlmModel && { vlmModel }),
...(websiteConfig && { websiteConfig }),
...(status && { status }),
...(intro !== undefined && { intro }),
@ -212,7 +214,7 @@ const updateTraining = async ({
$set: {
model: agentModel,
retryCount: 5,
lockTime: new Date()
lockTime: new Date('2000/1/1')
}
}
);

View File

@ -1,7 +1,7 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import type { NextApiRequest } from 'next';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { CreateTrainingUsageProps } from '@fastgpt/global/support/wallet/usage/api.d';
import { getLLMModel, getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '@fastgpt/service/core/ai/model';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
@ -24,7 +24,8 @@ async function handler(req: NextApiRequest) {
appName: name,
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel).name,
agentModel: getLLMModel(dataset.agentModel).name
agentModel: getLLMModel(dataset.agentModel).name,
vllmModel: getVlmModel(dataset.vlmModel)?.name
});
return billId;

View File

@ -8,12 +8,41 @@ import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/con
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
import { jiebaSplit } from '@fastgpt/service/common/string/jieba';
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
import { DatasetDataItemType } from '@fastgpt/global/core/dataset/type';
import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type';
import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { ClientSession } from '@fastgpt/service/common/mongo';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
const formatIndexes = ({
indexes,
q,
a = ''
}: {
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
q: string;
a?: string;
}) => {
indexes = indexes || [];
const defaultIndex = getDefaultIndex({ q, a });
// 1. Reset default index
indexes = indexes.filter((item) => item.type !== DatasetDataIndexTypeEnum.default);
// 2. Add default index
indexes.unshift(...defaultIndex);
// 3. Filter same text
indexes = indexes.filter(
(item, index, self) =>
!!item.text.trim() && index === self.findIndex((t) => t.text === item.text)
);
return indexes.map((index) => ({
type: index.type,
text: index.text,
dataId: index.dataId
}));
};
/* insert data.
* 1. create data id
* 2. insert pg
@ -41,42 +70,28 @@ export async function insertData2Dataset({
return Promise.reject("teamId and tmbId can't be the same");
}
const qaStr = getDefaultIndex({ q, a }).text;
// 1. Get vector indexes and insert
// Empty indexes check, if empty, create default index
indexes =
Array.isArray(indexes) && indexes.length > 0
? indexes.map((index) => ({
text: index.text,
dataId: undefined,
defaultIndex: index.text.trim() === qaStr
}))
: [getDefaultIndex({ q, a })];
if (!indexes.find((index) => index.defaultIndex)) {
indexes.unshift(getDefaultIndex({ q, a }));
} else if (q && a && !indexes.find((index) => index.text === q)) {
// push a q index
indexes.push({
defaultIndex: false,
text: q
});
}
indexes = indexes.slice(0, 6);
const newIndexes = formatIndexes({ indexes, q, a });
// insert to vector store
const result = await Promise.all(
indexes.map((item) =>
insertDatasetDataVector({
newIndexes.map(async (item) => {
const result = await insertDatasetDataVector({
query: item.text,
model: getEmbeddingModel(model),
teamId,
datasetId,
collectionId
})
)
});
return {
tokens: result.tokens,
index: {
...item,
dataId: result.insertId
}
};
})
);
// 2. Create mongo data
@ -89,13 +104,8 @@ export async function insertData2Dataset({
collectionId,
q,
a,
// FullText tmp
// fullTextToken: jiebaSplit({ text: qaStr }),
chunkIndex,
indexes: indexes?.map((item, i) => ({
...item,
dataId: result[i].insertId
}))
indexes: result.map((item) => item.index)
}
],
{ session, ordered: true }
@ -109,7 +119,7 @@ export async function insertData2Dataset({
datasetId,
collectionId,
dataId: _id,
fullTextToken: jiebaSplit({ text: qaStr })
fullTextToken: jiebaSplit({ text: `${q}\n${a}`.trim() })
}
],
{ session, ordered: true }
@ -122,7 +132,7 @@ export async function insertData2Dataset({
}
/**
* update data
* Update data(indexes overwrite)
* 1. compare indexes
* 2. insert new pg data
* session run:
@ -139,30 +149,19 @@ export async function updateData2Dataset({
if (!Array.isArray(indexes)) {
return Promise.reject('indexes is required');
}
const qaStr = getDefaultIndex({ q, a }).text;
// patch index and update pg
// 1. Get mongo data
const mongoData = await MongoDatasetData.findById(dataId);
if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// remove defaultIndex
let formatIndexes = indexes.map((index) => ({
...index,
text: index.text.trim(),
defaultIndex: index.text.trim() === qaStr
}));
if (!formatIndexes.find((index) => index.defaultIndex)) {
const defaultIndex = mongoData.indexes.find((index) => index.defaultIndex);
formatIndexes.unshift(defaultIndex ? defaultIndex : getDefaultIndex({ q, a }));
}
formatIndexes = formatIndexes.slice(0, 6);
// 2. Compute indexes
const formatIndexesResult = formatIndexes({ indexes, q, a });
// patch indexes, create, update, delete
// 3. Patch indexes, create, update, delete
const patchResult: PatchIndexesProps[] = [];
// find database indexes in new Indexes, if have not, delete it
for (const item of mongoData.indexes) {
const index = formatIndexes.find((index) => index.dataId === item.dataId);
const index = formatIndexesResult.find((index) => index.dataId === item.dataId);
if (!index) {
patchResult.push({
type: 'delete',
@ -170,53 +169,48 @@ export async function updateData2Dataset({
});
}
}
for (const item of formatIndexes) {
const index = mongoData.indexes.find((index) => index.dataId === item.dataId);
// in database, update
if (index) {
// default index update
if (index.defaultIndex && index.text !== qaStr) {
patchResult.push({
type: 'update',
index: {
//@ts-ignore
...index.toObject(),
text: qaStr
}
});
continue;
}
// custom index update
if (index.text !== item.text) {
patchResult.push({
type: 'update',
index: item
});
continue;
}
patchResult.push({
type: 'unChange',
index: item
});
} else {
// not in database, create
for (const item of formatIndexesResult) {
if (!item.dataId) {
patchResult.push({
type: 'create',
index: item
});
} else {
const index = mongoData.indexes.find((index) => index.dataId === item.dataId);
if (!index) continue;
// Not change
if (index.text === item.text) {
patchResult.push({
type: 'unChange',
index: {
...item,
dataId: index.dataId
}
});
} else {
// index Update
patchResult.push({
type: 'update',
index: {
...item,
dataId: index.dataId
}
});
}
}
}
// update mongo updateTime
// 4. Update mongo updateTime (so the dirty-data checker can pick this record up)
mongoData.updateTime = new Date();
await mongoData.save();
// insert vector
const clonePatchResult2Insert: PatchIndexesProps[] = JSON.parse(JSON.stringify(patchResult));
// 5. Insert vector
const insertResult = await Promise.all(
clonePatchResult2Insert.map(async (item) => {
// insert new vector and update dataId
if (item.type === 'create' || item.type === 'update') {
patchResult
.filter((item) => item.type === 'create' || item.type === 'update')
.map(async (item) => {
// insert new vector and update dataId
const result = await insertDatasetDataVector({
query: item.index.text,
model: getEmbeddingModel(model),
@ -225,26 +219,22 @@ export async function updateData2Dataset({
collectionId: mongoData.collectionId
});
item.index.dataId = result.insertId;
return result;
}
return {
tokens: 0
};
})
return {
tokens: result.tokens
};
})
);
const tokens = insertResult.reduce((acc, cur) => acc + cur.tokens, 0);
const newIndexes = patchResult
.filter((item) => item.type !== 'delete')
.map((item) => item.index) as DatasetDataIndexItemType[];
console.log(newIndexes, '---');
// console.log(clonePatchResult2Insert);
await mongoSessionRun(async (session) => {
// update mongo
const newIndexes = clonePatchResult2Insert
.filter((item) => item.type !== 'delete')
.map((item) => item.index);
// update mongo other data
// Update MongoData
mongoData.q = q || mongoData.q;
mongoData.a = a ?? mongoData.a;
// FullText tmp
// mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() });
// @ts-ignore
mongoData.indexes = newIndexes;
await mongoData.save({ session });
@ -255,15 +245,15 @@ export async function updateData2Dataset({
{ session }
);
// delete vector
// Delete vector
const deleteIdList = patchResult
.filter((item) => item.type === 'delete' || item.type === 'update')
.map((item) => item.index.dataId)
.filter(Boolean);
.filter(Boolean) as string[];
if (deleteIdList.length > 0) {
await deleteDatasetDataVector({
teamId: mongoData.teamId,
idList: deleteIdList as string[]
idList: deleteIdList
});
}
});
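
Because every data write in this file now goes through the new `formatIndexes` helper, a minimal behavioural sketch may help reviewers; the stand-in types and `getDefaultIndexSketch` are simplifications (the real `getDefaultIndex` lives in `@fastgpt/global` and is assumed here to return a single default-typed index built from `q` and `a`):

```ts
// Simplified stand-in types; the real ones live in @fastgpt/global and are not reproduced here.
type IndexItem = { type: 'default' | 'custom'; text: string; dataId?: string };

// Assumed shape of getDefaultIndex({ q, a }): one default-typed index built from q and a.
const getDefaultIndexSketch = (q: string, a = ''): IndexItem[] => [
  { type: 'default', text: a ? `${q}\n${a}` : q }
];

// Mirrors the three normalization steps of the formatIndexes helper above.
const normalizeIndexes = (indexes: IndexItem[], q: string, a = ''): IndexItem[] => {
  // 1. Reset: incoming default indexes are discarded and rebuilt from q/a.
  const result = indexes.filter((item) => item.type !== 'default');
  // 2. Prepend the rebuilt default index.
  result.unshift(...getDefaultIndexSketch(q, a));
  // 3. Drop empty texts and duplicates, keeping the first occurrence.
  return result.filter(
    (item, i, self) => !!item.text.trim() && i === self.findIndex((t) => t.text === item.text)
  );
};

const normalized = normalizeIndexes(
  [
    { type: 'default', text: 'stale default index' },
    { type: 'custom', text: '图片描述:架构图,包含向量库与训练队列', dataId: 'idx-1' },
    { type: 'custom', text: '图片描述:架构图,包含向量库与训练队列' }
  ],
  '这张图讲了什么?',
  '它展示了知识库的训练流程。'
);
// normalized: rebuilt default index first, then a single de-duplicated custom index (dataId kept).
console.log(normalized);
```

The practical effect visible in the diff: the old six-index cap (`indexes.slice(0, 6)`) is gone, and the default index is always rebuilt from the current `q`/`a` rather than trusted from the client.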

View File

@ -142,7 +142,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
teamId: data.teamId,
tmbId: data.tmbId,
collectionId: data.collectionId,
trainingMode: TrainingModeEnum.chunk,
mode: TrainingModeEnum.chunk,
data: qaArr.map((item) => ({
...item,
chunkIndex: data.chunkIndex
@ -179,9 +179,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
}
}
/**
*
*/
// Format qa answer
function formatSplitText(text: string, rawText: string) {
text = text.replace(/\\n/g, '\n'); // Convert escaped "\n" sequences back to real newlines
const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // Regex that matches each Q/A pair
@ -194,13 +192,7 @@ function formatSplitText(text: string, rawText: string) {
if (q) {
result.push({
q,
a,
indexes: [
{
defaultIndex: true,
text: `${q}\n${a.trim().replace(/\n\s*/g, '\n')}`
}
]
a
});
}
}
@ -211,13 +203,7 @@ function formatSplitText(text: string, rawText: string) {
chunks.forEach((chunk) => {
result.push({
q: chunk,
a: '',
indexes: [
{
defaultIndex: true,
text: chunk
}
]
a: ''
});
});
}
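
A small illustration of how the Q/A regex in `formatSplitText` above carves a model reply into pairs; the sample reply and the capture-group indexes used for extraction are illustrative, only the regex itself is copied from the diff:

```ts
// Regex copied from formatSplitText above; matches "Qn: ... An: ..." pairs.
const qaRegex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g;

// Invented sample reply from the QA-split model.
const sample = `Q1: FastGPT 是什么?
A1: 一个基于 LLM 的知识库问答系统。
Q2: 它支持图片索引吗?
A2: 4.9.0 起支持图片自动标注与索引。`;

// Group 2 holds the question text, group 5 the answer text (group indexes assumed here).
const pairs = [...sample.matchAll(qaRegex)].map((m) => ({
  q: m[2].trim(),
  a: m[5].trim()
}));

console.log(pairs);
// [
//   { q: 'FastGPT 是什么?', a: '一个基于 LLM 的知识库问答系统。' },
//   { q: '它支持图片索引吗?', a: '4.9.0 起支持图片自动标注与索引。' }
// ]
```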

View File

@ -20,6 +20,16 @@ const reduceQueue = () => {
return global.vectorQueueLen === 0;
};
const reduceQueueAndReturn = (delay = 0) => {
reduceQueue();
if (delay) {
setTimeout(() => {
generateVector();
}, delay);
} else {
generateVector();
}
};
/* Vector index generation queue. Each import runs as its own worker thread */
export async function generateVector(): Promise<any> {
@ -45,20 +55,7 @@ export async function generateVector(): Promise<any> {
lockTime: new Date(),
$inc: { retryCount: -1 }
}
).select({
_id: 1,
teamId: 1,
tmbId: 1,
datasetId: 1,
collectionId: 1,
q: 1,
a: 1,
chunkIndex: 1,
dataId: 1,
indexes: 1,
model: 1,
billId: 1
});
);
// task preemption
if (!data) {
@ -85,14 +82,12 @@ export async function generateVector(): Promise<any> {
}
if (error) {
addLog.error(`[Vector Queue] Error`, { error });
reduceQueue();
return generateVector();
return reduceQueueAndReturn();
}
// auth balance
if (!(await checkTeamAiPointsAndLock(data.teamId))) {
reduceQueue();
return generateVector();
return reduceQueueAndReturn();
}
addLog.info(`[Vector Queue] Start`);
@ -119,15 +114,10 @@ export async function generateVector(): Promise<any> {
time: Date.now() - start
});
reduceQueue();
generateVector();
return reduceQueueAndReturn();
} catch (err: any) {
addLog.error(`[Vector Queue] Error`, err);
reduceQueue();
setTimeout(() => {
generateVector();
}, 1000);
return reduceQueueAndReturn(1000);
}
}

View File

@ -127,12 +127,12 @@ export const pushGenerateVectorUsage = ({
createUsage({
teamId,
tmbId,
appName: i18nT('common:support.wallet.moduleName.index'),
appName: i18nT('account_usage:embedding_index'),
totalPoints,
source,
list: [
{
moduleName: i18nT('common:support.wallet.moduleName.index'),
moduleName: i18nT('account_usage:embedding_index'),
amount: totalVector,
model: vectorModelName,
inputTokens
@ -203,7 +203,7 @@ export const pushQuestionGuideUsage = ({
});
};
export function pushAudioSpeechUsage({
export const pushAudioSpeechUsage = ({
appName = i18nT('common:support.wallet.usage.Audio Speech'),
model,
charsLength,
@ -217,7 +217,7 @@ export function pushAudioSpeechUsage({
teamId: string;
tmbId: string;
source: UsageSourceEnum;
}) {
}) => {
const { totalPoints, modelName } = formatModelChars2Points({
model,
inputTokens: charsLength,
@ -239,9 +239,9 @@ export function pushAudioSpeechUsage({
}
]
});
}
};
export function pushWhisperUsage({
export const pushWhisperUsage = ({
teamId,
tmbId,
duration
@ -249,7 +249,7 @@ export function pushWhisperUsage({
teamId: string;
tmbId: string;
duration: number;
}) {
}) => {
const whisperModel = getDefaultTTSModel();
if (!whisperModel) return;
@ -278,4 +278,4 @@ export function pushWhisperUsage({
}
]
});
}
};

View File

@ -1,4 +1,3 @@
import type { PreviewContextProps } from '@/pages/api/common/file/previewContent';
import { GET, POST } from '@/web/common/api/request';
import type { UploadImgProps } from '@fastgpt/global/common/file/api.d';
import { AxiosProgressEvent } from 'axios';
@ -19,11 +18,3 @@ export const postUploadFiles = (
'Content-Type': 'multipart/form-data; charset=utf-8'
}
});
export const getPreviewFileContent = (data: PreviewContextProps) =>
POST<{
previewContent: string;
totalLength: number;
}>('/common/file/previewContent', data, {
timeout: 600000
});

Some files were not shown because too many files have changed in this diff.