From 1dac2b70ecfe9fb0cf837e2bf923b5c1fcd926dd Mon Sep 17 00:00:00 2001
From: Archer <545436317@qq.com>
Date: Tue, 20 May 2025 09:59:24 +0800
Subject: [PATCH] perf: stream timeout;feat: hnsw max_scan_tuples config;fix:
 fulltext search merge error (#4838)

* perf: stream timeout

* feat: hnsw max_scan_tuples config

* fix: fulltext search merge error

* perf: jieba code
---
 .../zh-cn/docs/development/upgrading/4910.md  | 21 +++++
 .../global/common/system/types/index.d.ts     |  4 +-
 packages/service/common/string/jieba/index.ts |  4 +-
 packages/service/common/vectorDB/pg/index.ts  |  3 +-
 packages/service/core/ai/config.ts            |  2 +-
 .../core/dataset/data/dataTextSchema.ts       |  4 +-
 .../service/core/dataset/search/controller.ts | 90 +++++++++----------
 .../core/workflow/dispatch/code/run.ts        |  2 -
 projects/app/data/config.json                 |  1 +
 .../pageComponents/dataset/detail/Test.tsx    |  1 -
 10 files changed, 74 insertions(+), 58 deletions(-)
 create mode 100644 docSite/content/zh-cn/docs/development/upgrading/4910.md

diff --git a/docSite/content/zh-cn/docs/development/upgrading/4910.md b/docSite/content/zh-cn/docs/development/upgrading/4910.md
new file mode 100644
index 000000000..b0602f440
--- /dev/null
+++ b/docSite/content/zh-cn/docs/development/upgrading/4910.md
@@ -0,0 +1,21 @@
+---
+title: 'V4.9.10(进行中)'
+description: 'FastGPT V4.9.10 更新说明'
+icon: 'upgrade'
+draft: false
+toc: true
+weight: 790
+---
+
+
+## 🚀 新增内容
+
+1. 支持 PG 设置`systemEnv.hnswMaxScanTuples`参数，提高迭代搜索的数据总量。
+
+## ⚙️ 优化
+
+1. LLM stream 调用，默认超时时间调大。
+
+## 🐛 修复
+
+1. 全文检索多知识库时，结果得分排序不正确。
\ No newline at end of file
diff --git a/packages/global/common/system/types/index.d.ts b/packages/global/common/system/types/index.d.ts
index f3b6d33a6..f6f6c183d 100644
--- a/packages/global/common/system/types/index.d.ts
+++ b/packages/global/common/system/types/index.d.ts
@@ -130,9 +130,11 @@ export type SystemEnvType = {
   vectorMaxProcess: number;
   qaMaxProcess: number;
   vlmMaxProcess: number;
-  hnswEfSearch: number;
   tokenWorkers: number; // token count max worker
 
+  hnswEfSearch: number;
+  hnswMaxScanTuples: number;
+
   oneapiUrl?: string;
   chatApiKey?: string;
 
diff --git a/packages/service/common/string/jieba/index.ts b/packages/service/common/string/jieba/index.ts
index e6c376303..8df8df95d 100644
--- a/packages/service/common/string/jieba/index.ts
+++ b/packages/service/common/string/jieba/index.ts
@@ -10,6 +10,7 @@ let jieba: Jieba | undefined;
 })();
 
 const stopWords = new Set([
+  '\n',
   '--',
   '?',
   '“',
@@ -1519,8 +1520,7 @@ const stopWords = new Set([
 ]);
 
 export async function jiebaSplit({ text }: { text: string }) {
-  text = text.replace(/[#*`_~>[\](){}|]/g, '').replace(/\S*https?\S*/gi, '');
-
+  text = text.replace(/[#*`_~>[\](){}|]|\S*https?\S*/g, '').trim();
   const tokens = (await jieba!.cutAsync(text, true)) as string[];
 
   return (
diff --git a/packages/service/common/vectorDB/pg/index.ts b/packages/service/common/vectorDB/pg/index.ts
index 3a7cc3f9a..fbf268868 100644
--- a/packages/service/common/vectorDB/pg/index.ts
+++ b/packages/service/common/vectorDB/pg/index.ts
@@ -188,6 +188,7 @@ export class PgVectorCtrl {
       const results: any = await PgClient.query(
         `BEGIN;
           SET LOCAL hnsw.ef_search = ${global.systemEnv?.hnswEfSearch || 100};
+          SET LOCAL hnsw.max_scan_tuples = ${global.systemEnv?.hnswMaxScanTuples || 100000};
           SET LOCAL hnsw.iterative_scan = relaxed_order;
           WITH relaxed_results AS MATERIALIZED (
             select id, collection_id, vector <#> '[${vector}]' AS score
@@ -199,7 +200,7 @@ export class PgVectorCtrl {
           ) SELECT id, collection_id, score FROM relaxed_results ORDER BY score;
         COMMIT;`
       );
-      const rows = results?.[3]?.rows as PgSearchRawType[];
+      const rows = results?.[results.length - 2]?.rows as PgSearchRawType[];
 
       if (!Array.isArray(rows)) {
         return {
diff --git a/packages/service/core/ai/config.ts b/packages/service/core/ai/config.ts
index 8a2e45c7c..3b7500c55 100644
--- a/packages/service/core/ai/config.ts
+++ b/packages/service/core/ai/config.ts
@@ -78,7 +78,7 @@ export const createChatCompletion = async ({
     }
     body.model = modelConstantsData.model;
 
-    const formatTimeout = timeout ? timeout : body.stream ? 60000 : 600000;
+    const formatTimeout = timeout ? timeout : 600000;
     const ai = getAIApi({
       userKey,
       timeout: formatTimeout
diff --git a/packages/service/core/dataset/data/dataTextSchema.ts b/packages/service/core/dataset/data/dataTextSchema.ts
index b26c1eaa5..340b9b0ea 100644
--- a/packages/service/core/dataset/data/dataTextSchema.ts
+++ b/packages/service/core/dataset/data/dataTextSchema.ts
@@ -34,9 +34,9 @@ const DatasetDataTextSchema = new Schema({
 
 try {
   DatasetDataTextSchema.index(
-    { teamId: 1, datasetId: 1, fullTextToken: 'text' },
+    { teamId: 1, fullTextToken: 'text' },
     {
-      name: 'teamId_1_datasetId_1_fullTextToken_text',
+      name: 'teamId_1_fullTextToken_text',
       default_language: 'none'
     }
   );
diff --git a/packages/service/core/dataset/search/controller.ts b/packages/service/core/dataset/search/controller.ts
index fbfddd9f4..798350585 100644
--- a/packages/service/core/dataset/search/controller.ts
+++ b/packages/service/core/dataset/search/controller.ts
@@ -544,56 +544,50 @@ export async function searchDatasetData(
       };
     }
 
-    const searchResults = (
-      await Promise.all(
-        datasetIds.map(async (id) => {
-          return MongoDatasetDataText.aggregate(
-            [
-              {
-                $match: {
-                  teamId: new Types.ObjectId(teamId),
-                  datasetId: new Types.ObjectId(id),
-                  $text: { $search: await jiebaSplit({ text: query }) },
-                  ...(filterCollectionIdList
-                    ? {
-                        collectionId: {
-                          $in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
-                        }
-                      }
-                    : {}),
-                  ...(forbidCollectionIdList && forbidCollectionIdList.length > 0
-                    ? {
-                        collectionId: {
-                          $nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id))
-                        }
-                      }
-                    : {})
+    const searchResults = (await MongoDatasetDataText.aggregate(
+      [
+        {
+          $match: {
+            teamId: new Types.ObjectId(teamId),
+            $text: { $search: await jiebaSplit({ text: query }) },
+            datasetId: { $in: datasetIds.map((id) => new Types.ObjectId(id)) },
+            ...(filterCollectionIdList
+              ? {
+                  collectionId: {
+                    $in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
+                  }
                 }
-              },
-              {
-                $sort: {
-                  score: { $meta: 'textScore' }
+              : {}),
+            ...(forbidCollectionIdList && forbidCollectionIdList.length > 0
+              ? {
+                  collectionId: {
+                    $nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id))
+                  }
                 }
-              },
-              {
-                $limit: limit
-              },
-              {
-                $project: {
-                  _id: 1,
-                  collectionId: 1,
-                  dataId: 1,
-                  score: { $meta: 'textScore' }
-                }
-              }
-            ],
-            {
-              ...readFromSecondary
-            }
-          );
-        })
-      )
-    ).flat() as (DatasetDataTextSchemaType & { score: number })[];
+              : {})
+          }
+        },
+        {
+          $sort: {
+            score: { $meta: 'textScore' }
+          }
+        },
+        {
+          $limit: limit
+        },
+        {
+          $project: {
+            _id: 1,
+            collectionId: 1,
+            dataId: 1,
+            score: { $meta: 'textScore' }
+          }
+        }
+      ],
+      {
+        ...readFromSecondary
+      }
+    )) as (DatasetDataTextSchemaType & { score: number })[];
 
     // Get data and collections
     const [dataList, collections] = await Promise.all([
diff --git a/packages/service/core/workflow/dispatch/code/run.ts b/packages/service/core/workflow/dispatch/code/run.ts
index 4396953a6..9fe554684 100644
--- a/packages/service/core/workflow/dispatch/code/run.ts
+++ b/packages/service/core/workflow/dispatch/code/run.ts
@@ -49,8 +49,6 @@ export const dispatchRunCode = async (props: RunCodeType): Promise<RunCodeRespon
       variables: customVariables
     });
 
-    console.log(runResult);
-
     if (runResult.success) {
       return {
         [NodeOutputKeyEnum.rawResponse]: runResult.data.codeReturn,
diff --git a/projects/app/data/config.json b/projects/app/data/config.json
index 612050b16..1d0303370 100644
--- a/projects/app/data/config.json
+++ b/projects/app/data/config.json
@@ -10,6 +10,7 @@
     "vlmMaxProcess": 10, // 图片理解模型最大处理进程
     "tokenWorkers": 30, // Token 计算线程保持数，会持续占用内存，不能设置太大。
     "hnswEfSearch": 100, // 向量搜索参数，仅对 PG 和 OB 生效。越大，搜索越精确，但是速度越慢。设置为100，有99%+精度。
+    "hnswMaxScanTuples": 100000, // 向量搜索最大扫描数据量，仅对 PG 生效。
     "customPdfParse": {
       "url": "", // 自定义 PDF 解析服务地址
       "key": "", // 自定义 PDF 解析服务密钥
diff --git a/projects/app/src/pageComponents/dataset/detail/Test.tsx b/projects/app/src/pageComponents/dataset/detail/Test.tsx
index ddfe120d0..3ca1a9599 100644
--- a/projects/app/src/pageComponents/dataset/detail/Test.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Test.tsx
@@ -171,7 +171,6 @@ const Test = ({ datasetId }: { datasetId: string }) => {
           <Flex alignItems={'center'} justifyContent={'space-between'}>
             <MySelect<'text' | 'file'>
               size={'sm'}
-              w={'150px'}
               list={[
                 {
                   label: (