diff --git a/apps/knowledge/serializers/common.py b/apps/knowledge/serializers/common.py index 9c80e5ac0..2f52e2b44 100644 --- a/apps/knowledge/serializers/common.py +++ b/apps/knowledge/serializers/common.py @@ -242,7 +242,7 @@ def create_knowledge_index(knowledge_id=None, document_id=None): if len(result) == 0: return dims = result[0]['dims'] - sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_l2_ops) WHERE knowledge_id = '{k_id}'""" + sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_cosine_ops) WHERE knowledge_id = '{k_id}'""" update_execute(sql, []) maxkb_logger.info(f'Created index for knowledge ID: {k_id}') diff --git a/apps/knowledge/sql/blend_search.sql b/apps/knowledge/sql/blend_search.sql index afb1f0040..c70e66464 100644 --- a/apps/knowledge/sql/blend_search.sql +++ b/apps/knowledge/sql/blend_search.sql @@ -5,15 +5,17 @@ SELECT FROM ( SELECT DISTINCT ON - ( "paragraph_id" ) ( similarity ),* , - similarity AS comprehensive_score + ( "paragraph_id" ) ( 1 - distince + ts_similarity ) as similarity, *, + (1 - distince + ts_similarity) AS comprehensive_score FROM ( SELECT *, - (( 1 - ( embedding.embedding <=> %s ) )+ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS similarity + (embedding.embedding::vector(%s) <=> %s) as distince, + (ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS ts_similarity FROM embedding ${embedding_query} + ORDER BY distince ) TEMP ORDER BY paragraph_id, diff --git a/apps/knowledge/sql/embedding_search.sql b/apps/knowledge/sql/embedding_search.sql index ce3d4a580..1b5689959 100644 --- a/apps/knowledge/sql/embedding_search.sql +++ b/apps/knowledge/sql/embedding_search.sql @@ -5,12 +5,12 @@ SELECT FROM ( SELECT DISTINCT ON - ("paragraph_id") ( similarity ),* ,similarity AS comprehensive_score + ("paragraph_id") ( 1 - distince ),* ,(1 - distince) AS comprehensive_score FROM - ( SELECT *, ( 1 - ( embedding.embedding <=> %s ) ) AS similarity FROM embedding ${embedding_query}) TEMP + ( SELECT *, ( embedding.embedding::vector(%s) <=> %s ) AS distince FROM embedding ${embedding_query} ORDER BY distince) TEMP ORDER BY paragraph_id, - similarity DESC + distince ) DISTINCT_TEMP WHERE comprehensive_score>%s ORDER BY comprehensive_score DESC diff --git a/apps/knowledge/vector/pg_vector.py b/apps/knowledge/vector/pg_vector.py index d8235d2c4..f787cd83f 100644 --- a/apps/knowledge/vector/pg_vector.py +++ b/apps/knowledge/vector/pg_vector.py @@ -172,8 +172,13 @@ class EmbeddingSearch(ISearch): os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'embedding_search.sql')), with_table_name=True) - embedding_model = select_list(exec_sql, - [json.dumps(query_embedding), *exec_params, similarity, top_number]) + embedding_model = select_list(exec_sql, [ + len(query_embedding), + json.dumps(query_embedding), + *exec_params, + similarity, + top_number + ]) return embedding_model def support(self, search_mode: SearchMode): @@ -193,8 +198,12 @@ class KeywordsSearch(ISearch): os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'keywords_search.sql')), with_table_name=True) - embedding_model = select_list(exec_sql, - [to_query(query_text), *exec_params, similarity, top_number]) + embedding_model = select_list(exec_sql, [ + to_query(query_text), + *exec_params, + similarity, + top_number + ]) return embedding_model def support(self, search_mode: SearchMode): @@ -214,9 +223,13 @@ class BlendSearch(ISearch): os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'blend_search.sql')), with_table_name=True) - embedding_model = select_list(exec_sql, - [json.dumps(query_embedding), to_query(query_text), *exec_params, similarity, - top_number]) + embedding_model = select_list(exec_sql, [ + len(query_embedding), + json.dumps(query_embedding), + to_query(query_text), + *exec_params, similarity, + top_number + ]) return embedding_model def support(self, search_mode: SearchMode):