feat: add create and drop knowledge index functions for improved database management
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run

This commit is contained in:
CaptainB 2025-07-23 21:42:33 +08:00
parent fc4a7df669
commit 6ce1eb7116
4 changed files with 51 additions and 2 deletions

View File

@ -26,6 +26,7 @@ from common.utils.logger import maxkb_logger
from common.utils.page_utils import page_desc
from knowledge.models import Paragraph, Status, Document, ProblemParagraphMapping, TaskType, State, SourceType, \
SearchMode
from knowledge.serializers.common import create_knowledge_index
from maxkb.conf import (PROJECT_DIR)
lock = threading.Lock()
@ -290,6 +291,8 @@ class ListenerManagement:
ListenerManagement.get_aggregation_document_status(
document_id)),
is_the_task_interrupted)
# 检查是否存在索引
create_knowledge_index(document_id=document_id)
except Exception as e:
maxkb_logger.error(_('Vectorized document: {document_id} error {error} {traceback}').format(
document_id=document_id, error=str(e), traceback=traceback.format_exc()))

View File

@ -18,10 +18,12 @@ from rest_framework import serializers
from common.config.embedding_config import ModelManage
from common.db.search import native_search
from common.db.sql_execute import update_execute
from common.db.sql_execute import sql_execute, update_execute
from common.exception.app_exception import AppApiException
from common.utils.common import get_file_content
from common.utils.fork import Fork
from common.utils.logger import maxkb_logger
from knowledge.models import Document
from knowledge.models import Paragraph, Problem, ProblemParagraphMapping, Knowledge, File
from maxkb.conf import PROJECT_DIR
from models_provider.tools import get_model
@ -220,3 +222,44 @@ def get_knowledge_operation_object(knowledge_id: str):
"update_time": knowledge_model.update_time
}
return {}
def create_knowledge_index(knowledge_id=None, document_id=None):
    """Create the per-knowledge partial HNSW index on the ``embedding`` table if it is missing.

    The index is named ``embedding_hnsw_idx_<knowledge_id>`` and covers only rows of
    that knowledge base (partial index on ``knowledge_id``). If the knowledge base has
    no embedding rows yet, the vector dimension cannot be determined and nothing is done.

    :param knowledge_id: id of the knowledge base; takes precedence when given.
    :param document_id: id of a document used to resolve the knowledge base when
        ``knowledge_id`` is not supplied.
    :raises AppApiException: if neither id is provided, or the resolved id contains
        characters unsafe to embed in SQL.
    """
    import re  # local import: keeps this fix self-contained in the function

    if knowledge_id is None and document_id is None:
        raise AppApiException(500, _('Knowledge ID or Document ID must be provided'))
    if knowledge_id is not None:
        k_id = knowledge_id
    else:
        document = QuerySet(Document).filter(id=document_id).first()
        if document is None:
            # Document may have been deleted before vectorization finished; nothing to index.
            return
        k_id = document.knowledge_id
    # DDL identifiers and partial-index predicates cannot be parameterized, so refuse any
    # id that could break out of the quoted identifier / string literal. Ids are presumably
    # UUIDs (hex + dashes); the broader word-char set is a defensive allowance.
    k_id = str(k_id)
    if not re.fullmatch(r'[0-9A-Za-z_-]+', k_id):
        raise AppApiException(500, _('Invalid knowledge ID'))
    sql = f"SELECT indexname, indexdef FROM pg_indexes WHERE tablename = 'embedding' AND indexname = 'embedding_hnsw_idx_{k_id}'"
    index = sql_execute(sql, [])
    if not index:
        # Probe one row to learn the stored vector dimensionality for this knowledge base.
        sql = f"SELECT vector_dims(embedding) AS dims FROM embedding WHERE knowledge_id = '{k_id}' LIMIT 1"
        result = sql_execute(sql, [])
        if not result:
            # No embeddings yet -> dimension unknown; the index will be created on a later run.
            return
        dims = result[0]['dims']
        sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_l2_ops) WHERE knowledge_id = '{k_id}'"""
        update_execute(sql, [])
        maxkb_logger.info(f'Created index for knowledge ID: {k_id}')
def drop_knowledge_index(knowledge_id=None, document_id=None):
    """Drop the per-knowledge HNSW index ``embedding_hnsw_idx_<knowledge_id>`` if it exists.

    :param knowledge_id: id of the knowledge base; takes precedence when given.
    :param document_id: id of a document used to resolve the knowledge base when
        ``knowledge_id`` is not supplied.
    :raises AppApiException: if neither id is provided, or the resolved id contains
        characters unsafe to embed in SQL.
    """
    import re  # local import: keeps this fix self-contained in the function

    if knowledge_id is None and document_id is None:
        raise AppApiException(500, _('Knowledge ID or Document ID must be provided'))
    if knowledge_id is not None:
        k_id = knowledge_id
    else:
        document = QuerySet(Document).filter(id=document_id).first()
        if document is None:
            # Document already gone; there is no index to resolve or drop.
            return
        k_id = document.knowledge_id
    # DROP INDEX identifiers cannot be parameterized, so refuse any id that could break
    # out of the quoted identifier (ids are presumably UUIDs: hex + dashes).
    k_id = str(k_id)
    if not re.fullmatch(r'[0-9A-Za-z_-]+', k_id):
        raise AppApiException(500, _('Invalid knowledge ID'))
    sql = f"SELECT indexname, indexdef FROM pg_indexes WHERE tablename = 'embedding' AND indexname = 'embedding_hnsw_idx_{k_id}'"
    index = sql_execute(sql, [])
    if index:
        sql = f'DROP INDEX "embedding_hnsw_idx_{k_id}"'
        update_execute(sql, [])
        maxkb_logger.info(f'Dropped index for knowledge ID: {k_id}')

View File

@ -30,7 +30,7 @@ from common.utils.logger import maxkb_logger
from common.utils.split_model import get_split_model
from knowledge.models import Knowledge, KnowledgeScope, KnowledgeType, Document, Paragraph, Problem, \
ProblemParagraphMapping, TaskType, State, SearchMode, KnowledgeFolder, File
from knowledge.serializers.common import ProblemParagraphManage, get_embedding_model_id_by_knowledge_id, MetaSerializer, \
from knowledge.serializers.common import ProblemParagraphManage, drop_knowledge_index, get_embedding_model_id_by_knowledge_id, MetaSerializer, \
GenerateRelatedSerializer, get_embedding_model_by_knowledge_id, list_paragraph, write_image, zip_dir
from knowledge.serializers.document import DocumentSerializers
from knowledge.task.embedding import embedding_by_knowledge, delete_embedding_by_knowledge
@ -418,6 +418,7 @@ class KnowledgeSerializer(serializers.Serializer):
QuerySet(Problem).filter(knowledge=knowledge).delete()
QuerySet(WorkspaceUserResourcePermission).filter(target=knowledge.id).delete()
QuerySet(ApplicationKnowledgeMapping).filter(knowledge_id=knowledge.id).delete()
drop_knowledge_index(knowledge_id=knowledge.id)
knowledge.delete()
File.objects.filter(
source_id=knowledge.id,

View File

@ -13,6 +13,7 @@ from common.event import ListenerManagement, UpdateProblemArgs, UpdateEmbeddingK
UpdateEmbeddingDocumentIdArgs
from common.utils.logger import maxkb_logger
from knowledge.models import Document, TaskType, State
from knowledge.serializers.common import drop_knowledge_index
from models_provider.tools import get_model
from models_provider.models import Model
from ops import celery_app
@ -102,6 +103,7 @@ def embedding_by_knowledge(knowledge_id, model_id):
maxkb_logger.info(_('Start--->Vectorized knowledge: {knowledge_id}').format(knowledge_id=knowledge_id))
try:
ListenerManagement.delete_embedding_by_knowledge(knowledge_id)
drop_knowledge_index(knowledge_id=knowledge_id)
document_list = QuerySet(Document).filter(knowledge_id=knowledge_id)
maxkb_logger.info(_('Knowledge documentation: {document_names}').format(
document_names=", ".join([d.name for d in document_list])))