mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
* feat: init knowledge workflow * feat: add knowledge workflow and version models, serializers, and API views * feat: knowledge workflow * feat: knowledge workflow * feat: add KnowledgeWorkflowModelSerializer and Operate class for workflow management * fix: route * feat: knowledge workflow * feat: Knowledge workflow permission * feat: knowledge workflow * feat: knowledge workflow * feat: knowledge workflow * feat: knowledge workflow * feat: Data source web node * fix: Back route * feat: knowledge workflow * feat: knowledge workflow * feat: Knowledge write node * feat: add Data Source tool functionality and localization * feat: add Data Source tool functionality and localization * feat: knowledge workflow * feat: knowledge workflow * fix: simplify export tool permission check in ToolListContainer.vue * fix: simplify export condition in ToolResourceIndex.vue * fix: simplify condition for copying tool in ToolListContainer * feat: knowledge workflow * fix: Upload local files and add output fields * feat: Knowledge write * feat: add Document Split Node functionality and localization * feat: add Document Split Node functionality and localization * feat: Knowledge write * feat: enhance Document Split Node with result processing and problem list generation * fix: Allow problem be blank * feat: enhance Document Split Node with result processing and problem list generation * feat: tool datasource * fix: Optimization of knowledge base workflow execution logic * refactor: streamline image handling by updating application and knowledge ID management * refactor: streamline image handling by updating application and knowledge ID management * feat: extend support modes in variable aggregation node to include knowledge workflows * feat: Chunks stored * refactor: simplify file handling in document extraction by removing unnecessary byte conversion and enhancing file saving logic * refactor: update file ID assignment in document extraction to use provided metadata * feat: Workflow 
menu that distinguishes between applications and knowledge bases * refactor: update file ID assignment in document extraction to use provided metadata * fix: Add workspace ID as workflow execution parameter * feat: add code template for Data Source tool form functionality * refactor: remove unused sys import and improve module handling * feat: Execution details support loading status * refactor: update tool type handling and improve category merging logic * feat: Alter fork depth * fix: ensure filterList is properly initialized and updated in getList function * refactor: simplify ToolStoreDialog by removing unused toolType logic * perf: Optimize the style * style: adjust div width for improved layout in Tree component * refactor: improve polling mechanism for knowledge workflow action * fix: Get workspace_id from workflow params * fix: filter out 'file_bytes' from result in get_details method * feat: add recursive filtering for file_bytes in context data * fix: append results to paragraph_list instead of replacing it * perf: Optimize translation files * fix: include document name in bytes_to_uploaded_file call for better file handling * refactor: optimize buffer retrieval in document processing * refactor: remove redundant parameter from bytes_to_uploaded_file call * fix: Page style optimization * feat: add slider for setting limit in document rules form * feat: add workflow knowledge management endpoints and related functionality * fix: swap file size and file count limits in form inputs * refactor: update tool_config args to use list format for improved readability * feat: Node supports knowledge base workflow * feat: Node supports knowledge base workflow * fix: Basic node data cannot be obtained in the workflow * style: Knowledge base workflow debugging page style adjustment * fix: Loop nodes cannot be used in the knowledge base workflow * fix: Knowledge base workflow variable assignment node * feat: add chunk size slider to form for custom split strategy * fix: 
Workflow style optimization --------- Co-authored-by: CaptainB <bin@fit2cloud.com> Co-authored-by: zhangzhanwei <zhanwei.zhang@fit2cloud.com> Co-authored-by: wangdan-fit2cloud <dan.wang@fit2cloud.com>
191 lines
5.9 KiB
Python
191 lines
5.9 KiB
Python
# coding=utf-8
|
||
"""
|
||
@project: maxkb
|
||
@Author:虎
|
||
@file: base_vector.py
|
||
@date:2023/10/18 19:16
|
||
@desc:
|
||
"""
|
||
import threading
|
||
from abc import ABC, abstractmethod
|
||
from functools import reduce
|
||
from typing import List, Dict
|
||
|
||
from langchain_core.embeddings import Embeddings
|
||
|
||
from common.chunk import text_to_chunk
|
||
from common.utils.common import sub_array
|
||
from knowledge.models import SourceType, SearchMode
|
||
|
||
lock = threading.Lock()
|
||
|
||
|
||
def chunk_data(data: Dict):
    """Split a paragraph-type record into one record per text chunk.

    :param data: embedding source record; reads keys 'source_type',
        'text' and, optionally, a precomputed 'chunks' list.
    :return: for paragraph sources, a list of copies of ``data`` with
        'text' replaced by each chunk; otherwise ``[data]`` unchanged.
    """
    if str(data.get('source_type')) != str(SourceType.PARAGRAPH.value):
        # Non-paragraph sources are embedded as a single record.
        return [data]
    # Prefer precomputed chunks; fall back to splitting the raw text.
    # `or` evaluates data.get('chunks') once (the original called it twice).
    chunk_list = data.get('chunks') or text_to_chunk(data.get('text'))
    return [{**data, 'text': chunk} for chunk in chunk_list]
|
||
|
||
|
||
def chunk_data_list(data_list: List[Dict]):
    """Chunk every record in ``data_list`` and flatten into one list.

    :param data_list: records to expand via :func:`chunk_data`.
    :return: flat list of all chunked records, in input order.
    """
    # A flattening comprehension is O(n); the previous
    # reduce(lambda x, y: [*x, *y], result, []) rebuilt the accumulator
    # list on every step, which is accidentally O(n^2).
    return [chunk for data in data_list for chunk in chunk_data(data)]
|
||
|
||
|
||
class BaseVectorStore(ABC):
    """Abstract base class for vector-store backends.

    Provides chunking, lazy store creation and sub-batch orchestration on
    top of the storage-specific hooks (``_save``/``_batch_save``, ``query``,
    ``hit_test``, ``update_by_*``, ``delete_by_*``) that subclasses implement.
    """

    # Process-wide flag: set once any instance has verified (or created) the
    # underlying vector store, so the existence check runs only once.
    vector_exists = False

    @abstractmethod
    def vector_is_create(self) -> bool:
        """
        Check whether the vector store has already been created.

        :return: True if the vector store exists
        """
        pass

    @abstractmethod
    def vector_create(self):
        """
        Create the vector store.

        :return:
        """
        pass

    def save_pre_handler(self):
        """
        Pre-insert handler: ensure the vector store exists, creating it on
        first use, and cache the result in the class-level ``vector_exists``
        flag so the check is skipped on subsequent calls.

        :return: True
        """
        if not BaseVectorStore.vector_exists:
            if not self.vector_is_create():
                self.vector_create()
            BaseVectorStore.vector_exists = True
        return True

    def save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str,
             source_id: str,
             is_active: bool,
             embedding: Embeddings):
        """
        Insert a single record, chunking its text first.

        :param source_id: source id
        :param knowledge_id: knowledge base id
        :param text: text to embed
        :param source_type: source type
        :param document_id: document id
        :param is_active: whether the record is active
        :param embedding: embedding model used to vectorize the text
        :param paragraph_id: paragraph id
        :return: None (the original docstring said bool, but nothing is returned)
        """
        self.save_pre_handler()
        data = {'document_id': document_id, 'paragraph_id': paragraph_id, 'knowledge_id': knowledge_id,
                'is_active': is_active, 'source_id': source_id, 'source_type': source_type, 'text': text}
        chunk_list = chunk_data(data)
        # Split the chunk list into sub-batches so each _batch_save stays small.
        result = sub_array(chunk_list)
        for child_array in result:
            # Single-record saves are never interrupted, hence `lambda: False`.
            self._batch_save(child_array, embedding, lambda: False)

    def batch_save(self, data_list: List[Dict], embedding: Embeddings, is_the_task_interrupted):
        """
        Batch insert records, chunked and written in sub-batches.

        @param data_list: records to insert
        @param embedding: embedding model used to vectorize the text
        @param is_the_task_interrupted: zero-arg callable; returning True
            aborts the remaining sub-batches (checked before each one)
        :return: None
        """
        self.save_pre_handler()
        chunk_list = chunk_data_list(data_list)
        result = sub_array(chunk_list)
        for child_array in result:
            if not is_the_task_interrupted():
                self._batch_save(child_array, embedding, is_the_task_interrupted)
            else:
                break

    @abstractmethod
    def _save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str,
              source_id: str,
              is_active: bool,
              embedding: Embeddings):
        # Storage-specific single-record insert.
        pass

    @abstractmethod
    def _batch_save(self, text_list: List[Dict], embedding: Embeddings, is_the_task_interrupted):
        # Storage-specific batch insert of already-chunked records.
        pass

    def search(self, query_text, knowledge_id_list: list[str], exclude_document_id_list: list[str],
               exclude_paragraph_list: list[str],
               is_active: bool,
               embedding: Embeddings):
        # Convenience wrapper: embed the query text and return the best hit.
        if knowledge_id_list is None or len(knowledge_id_list) == 0:
            return []
        embedding_query = embedding.embed_query(query_text)
        # NOTE(review): this call does not match the abstract `query` signature
        # below — it omits query_text, document_id_list and search_mode, and
        # passes 1, 3, 0.65 positionally. Concrete subclasses apparently accept
        # this shape; verify against the implementations.
        result = self.query(embedding_query, knowledge_id_list, exclude_document_id_list, exclude_paragraph_list,
                            is_active, 1, 3, 0.65)
        # NOTE(review): assumes a non-empty result; would raise IndexError
        # otherwise — confirm subclass guarantees.
        return result[0]

    @abstractmethod
    def query(self, query_text: str, query_embedding: List[float], knowledge_id_list: list[str],
              document_id_list: list[str] | None,
              exclude_document_id_list: list[str],
              exclude_paragraph_list: list[str], is_active: bool, top_n: int, similarity: float,
              search_mode: SearchMode):
        # Storage-specific similarity search over the given knowledge bases.
        pass

    @abstractmethod
    def hit_test(self, query_text, knowledge_id: list[str], exclude_document_id_list: list[str], top_number: int,
                 similarity: float,
                 search_mode: SearchMode,
                 embedding: Embeddings):
        # Storage-specific retrieval hit test for a query.
        pass

    @abstractmethod
    def update_by_paragraph_id(self, paragraph_id: str, instance: Dict):
        # Update stored records matching one paragraph id.
        pass

    @abstractmethod
    def update_by_paragraph_ids(self, paragraph_ids: str, instance: Dict):
        # Update stored records matching several paragraph ids.
        # NOTE(review): annotation says str but the name suggests a list —
        # compare with update_by_source_ids below; verify against callers.
        pass

    @abstractmethod
    def update_by_source_id(self, source_id: str, instance: Dict):
        # Update stored records matching one source id.
        pass

    @abstractmethod
    def update_by_source_ids(self, source_ids: List[str], instance: Dict):
        # Update stored records matching several source ids.
        pass

    @abstractmethod
    def delete_by_knowledge_id(self, knowledge_id: str):
        # Delete all records of one knowledge base.
        pass

    @abstractmethod
    def delete_by_document_id(self, document_id: str):
        # Delete all records of one document.
        pass

    @abstractmethod
    def delete_by_document_id_list(self, document_id_list: List[str]):
        # Delete all records of several documents.
        pass

    @abstractmethod
    def delete_by_knowledge_id_list(self, knowledge_id_list: List[str]):
        # Delete all records of several knowledge bases.
        pass

    @abstractmethod
    def delete_by_source_id(self, source_id: str, source_type: str):
        # Delete records matching one source id of the given source type.
        pass

    @abstractmethod
    def delete_by_source_ids(self, source_ids: List[str], source_type: str):
        # Delete records matching several source ids of the given source type.
        pass

    @abstractmethod
    def delete_by_paragraph_id(self, paragraph_id: str):
        # Delete records of one paragraph.
        pass

    @abstractmethod
    def delete_by_paragraph_ids(self, paragraph_ids: List[str]):
        # Delete records of several paragraphs.
        pass
|