From 6ce1eb7116b2ae92f475955662896b9acccdf2e6 Mon Sep 17 00:00:00 2001 From: CaptainB Date: Wed, 23 Jul 2025 21:42:33 +0800 Subject: [PATCH] feat: add create and drop knowledge index functions for improved database management --- apps/common/event/listener_manage.py | 3 ++ apps/knowledge/serializers/common.py | 45 ++++++++++++++++++++++++- apps/knowledge/serializers/knowledge.py | 3 +- apps/knowledge/task/embedding.py | 2 ++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/apps/common/event/listener_manage.py b/apps/common/event/listener_manage.py index 75bb98d56..9aaa29bac 100644 --- a/apps/common/event/listener_manage.py +++ b/apps/common/event/listener_manage.py @@ -26,6 +26,7 @@ from common.utils.logger import maxkb_logger from common.utils.page_utils import page_desc from knowledge.models import Paragraph, Status, Document, ProblemParagraphMapping, TaskType, State, SourceType, \ SearchMode +from knowledge.serializers.common import create_knowledge_index from maxkb.conf import (PROJECT_DIR) lock = threading.Lock() @@ -290,6 +291,8 @@ class ListenerManagement: ListenerManagement.get_aggregation_document_status( document_id)), is_the_task_interrupted) + # 检查是否存在索引 + create_knowledge_index(document_id=document_id) except Exception as e: maxkb_logger.error(_('Vectorized document: {document_id} error {error} {traceback}').format( document_id=document_id, error=str(e), traceback=traceback.format_exc())) diff --git a/apps/knowledge/serializers/common.py b/apps/knowledge/serializers/common.py index 2d97fc00d..9c80e5ac0 100644 --- a/apps/knowledge/serializers/common.py +++ b/apps/knowledge/serializers/common.py @@ -18,10 +18,12 @@ from rest_framework import serializers from common.config.embedding_config import ModelManage from common.db.search import native_search -from common.db.sql_execute import update_execute +from common.db.sql_execute import sql_execute, update_execute from common.exception.app_exception import AppApiException from common.utils.common import get_file_content from common.utils.fork import Fork +from common.utils.logger import maxkb_logger +from knowledge.models import Document from knowledge.models import Paragraph, Problem, ProblemParagraphMapping, Knowledge, File from maxkb.conf import PROJECT_DIR from models_provider.tools import get_model @@ -220,3 +222,44 @@ def get_knowledge_operation_object(knowledge_id: str): "update_time": knowledge_model.update_time } return {} + + +def create_knowledge_index(knowledge_id=None, document_id=None): + if knowledge_id is None and document_id is None: + raise AppApiException(500, _('Knowledge ID or Document ID must be provided')) + + if knowledge_id is not None: + k_id = knowledge_id + else: + document = QuerySet(Document).filter(id=document_id).first() + k_id = document.knowledge_id + + sql = f"SELECT indexname, indexdef FROM pg_indexes WHERE tablename = 'embedding' AND indexname = 'embedding_hnsw_idx_{k_id}'" + index = sql_execute(sql, []) + if not index: + sql = f"SELECT vector_dims(embedding) AS dims FROM embedding WHERE knowledge_id = '{k_id}' LIMIT 1" + result = sql_execute(sql, []) + if len(result) == 0: + return + dims = result[0]['dims'] + sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_l2_ops) WHERE knowledge_id = '{k_id}'""" + update_execute(sql, []) + maxkb_logger.info(f'Created index for knowledge ID: {k_id}') + + +def drop_knowledge_index(knowledge_id=None, document_id=None): + if knowledge_id is None and document_id is None: + raise AppApiException(500, _('Knowledge ID or Document ID must be provided')) + + if knowledge_id is not None: + k_id = knowledge_id + else: + document = QuerySet(Document).filter(id=document_id).first() + k_id = document.knowledge_id + + sql = f"SELECT indexname, indexdef FROM pg_indexes WHERE tablename = 'embedding' AND indexname = 'embedding_hnsw_idx_{k_id}'" + index = sql_execute(sql, []) + if index: + sql = f'DROP INDEX "embedding_hnsw_idx_{k_id}"' + update_execute(sql, []) + maxkb_logger.info(f'Dropped index for knowledge ID: {k_id}') diff --git a/apps/knowledge/serializers/knowledge.py b/apps/knowledge/serializers/knowledge.py index 73ebea4d4..bd679ee3b 100644 --- a/apps/knowledge/serializers/knowledge.py +++ b/apps/knowledge/serializers/knowledge.py @@ -30,7 +30,7 @@ from common.utils.logger import maxkb_logger from common.utils.split_model import get_split_model from knowledge.models import Knowledge, KnowledgeScope, KnowledgeType, Document, Paragraph, Problem, \ ProblemParagraphMapping, TaskType, State, SearchMode, KnowledgeFolder, File -from knowledge.serializers.common import ProblemParagraphManage, get_embedding_model_id_by_knowledge_id, MetaSerializer, \ +from knowledge.serializers.common import ProblemParagraphManage, drop_knowledge_index, get_embedding_model_id_by_knowledge_id, MetaSerializer, \ GenerateRelatedSerializer, get_embedding_model_by_knowledge_id, list_paragraph, write_image, zip_dir from knowledge.serializers.document import DocumentSerializers from knowledge.task.embedding import embedding_by_knowledge, delete_embedding_by_knowledge @@ -418,6 +418,7 @@ class KnowledgeSerializer(serializers.Serializer): QuerySet(Problem).filter(knowledge=knowledge).delete() QuerySet(WorkspaceUserResourcePermission).filter(target=knowledge.id).delete() QuerySet(ApplicationKnowledgeMapping).filter(knowledge_id=knowledge.id).delete() + drop_knowledge_index(knowledge_id=knowledge.id) knowledge.delete() File.objects.filter( source_id=knowledge.id, diff --git a/apps/knowledge/task/embedding.py b/apps/knowledge/task/embedding.py index fc583803b..c0949574a 100644 --- a/apps/knowledge/task/embedding.py +++ b/apps/knowledge/task/embedding.py @@ -13,6 +13,7 @@ from common.event import ListenerManagement, UpdateProblemArgs, UpdateEmbeddingK UpdateEmbeddingDocumentIdArgs from common.utils.logger import maxkb_logger from knowledge.models import Document, TaskType, State +from knowledge.serializers.common import drop_knowledge_index from models_provider.tools import get_model from models_provider.models import Model from ops import celery_app @@ -102,6 +103,7 @@ def embedding_by_knowledge(knowledge_id, model_id): maxkb_logger.info(_('Start--->Vectorized knowledge: {knowledge_id}').format(knowledge_id=knowledge_id)) try: ListenerManagement.delete_embedding_by_knowledge(knowledge_id) + drop_knowledge_index(knowledge_id=knowledge_id) document_list = QuerySet(Document).filter(knowledge_id=knowledge_id) maxkb_logger.info(_('Knowledge documentation: {document_names}').format( document_names=", ".join([d.name for d in document_list])))