fix: update SQL queries to improve similarity calculations and indexing
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run

This commit is contained in:
CaptainB 2025-07-24 16:18:11 +08:00
parent b3a5dc4a1c
commit 4d18b78d29
4 changed files with 29 additions and 14 deletions

View File

@ -242,7 +242,7 @@ def create_knowledge_index(knowledge_id=None, document_id=None):
if len(result) == 0:
return
dims = result[0]['dims']
sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_l2_ops) WHERE knowledge_id = '{k_id}'"""
sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_cosine_ops) WHERE knowledge_id = '{k_id}'"""
update_execute(sql, [])
maxkb_logger.info(f'Created index for knowledge ID: {k_id}')

View File

@ -5,15 +5,17 @@ SELECT
FROM
(
SELECT DISTINCT ON
( "paragraph_id" ) ( similarity ),* ,
similarity AS comprehensive_score
( "paragraph_id" ) ( 1 - distince + ts_similarity ) as similarity, *,
(1 - distince + ts_similarity) AS comprehensive_score
FROM
(
SELECT
*,
(( 1 - ( embedding.embedding <=> %s ) )+ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS similarity
(embedding.embedding::vector(%s) <=> %s) as distince,
(ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS ts_similarity
FROM
embedding ${embedding_query}
ORDER BY distince
) TEMP
ORDER BY
paragraph_id,

View File

@ -5,12 +5,12 @@ SELECT
FROM
(
SELECT DISTINCT ON
("paragraph_id") ( similarity ),* ,similarity AS comprehensive_score
("paragraph_id") ( 1 - distince ),* ,(1 - distince) AS comprehensive_score
FROM
( SELECT *, ( 1 - ( embedding.embedding <=> %s ) ) AS similarity FROM embedding ${embedding_query}) TEMP
( SELECT *, ( embedding.embedding::vector(%s) <=> %s ) AS distince FROM embedding ${embedding_query} ORDER BY distince) TEMP
ORDER BY
paragraph_id,
similarity DESC
distince
) DISTINCT_TEMP
WHERE comprehensive_score>%s
ORDER BY comprehensive_score DESC

View File

@ -172,8 +172,13 @@ class EmbeddingSearch(ISearch):
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
'embedding_search.sql')),
with_table_name=True)
embedding_model = select_list(exec_sql,
[json.dumps(query_embedding), *exec_params, similarity, top_number])
embedding_model = select_list(exec_sql, [
len(query_embedding),
json.dumps(query_embedding),
*exec_params,
similarity,
top_number
])
return embedding_model
def support(self, search_mode: SearchMode):
@ -193,8 +198,12 @@ class KeywordsSearch(ISearch):
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
'keywords_search.sql')),
with_table_name=True)
embedding_model = select_list(exec_sql,
[to_query(query_text), *exec_params, similarity, top_number])
embedding_model = select_list(exec_sql, [
to_query(query_text),
*exec_params,
similarity,
top_number
])
return embedding_model
def support(self, search_mode: SearchMode):
@ -214,9 +223,13 @@ class BlendSearch(ISearch):
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
'blend_search.sql')),
with_table_name=True)
embedding_model = select_list(exec_sql,
[json.dumps(query_embedding), to_query(query_text), *exec_params, similarity,
top_number])
embedding_model = select_list(exec_sql, [
len(query_embedding),
json.dumps(query_embedding),
to_query(query_text),
*exec_params, similarity,
top_number
])
return embedding_model
def support(self, search_mode: SearchMode):