diff --git a/apps/common/util/ts_vecto_util.py b/apps/common/util/ts_vecto_util.py
index b5d4de3fd..9def9585f 100644
--- a/apps/common/util/ts_vecto_util.py
+++ b/apps/common/util/ts_vecto_util.py
@@ -11,6 +11,7 @@ import uuid
 from typing import List
 
 import jieba
+import jieba.posseg
 from jieba import analyse
 
 from common.util.split_model import group_by
@@ -25,7 +26,10 @@ for jieba_word in jieba_word_list_cache:
 word_pattern_list = [r"v\d+.\d+.\d+",
                      r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}"]
 
-remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./-'
+remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./'
+
+# Part-of-speech flags whose tokens to_ts_vector drops (punctuation / non-word symbols).
+jieba_remove_flag_list = ['x', 'w']
 
 
 def get_word_list(text: str):
@@ -82,8 +86,11 @@ def to_ts_vector(text: str):
     # 替换字符串
     text = replace_word(word_dict, text)
     # 分词
-    result = jieba.tokenize(text, mode='search')
-    result_ = [{'word': get_key_by_word_dict(item[0], word_dict), 'index': item[1]} for item in result]
+    result = jieba.posseg.lcut(text, HMM=True, use_paddle=True)
+    # Drop punctuation tokens; 'index' below is the token position after filtering.
+    result = [item for item in result if item.flag not in jieba_remove_flag_list]
+    result_ = [{'word': get_key_by_word_dict(item.word, word_dict), 'index': index}
+               for index, item in enumerate(result)]
     result_group = group_by(result_, lambda r: r['word'])
     return " ".join(
         [f"{key.lower()}:{','.join([str(item['index'] + 1) for item in result_group[key]][:20])}" for key in
diff --git a/apps/embedding/migrations/0002_embedding_search_vector.py b/apps/embedding/migrations/0002_embedding_search_vector.py
index 7d06d6046..c73a5a031 100644
--- a/apps/embedding/migrations/0002_embedding_search_vector.py
+++ b/apps/embedding/migrations/0002_embedding_search_vector.py
@@ -1,4 +1,5 @@
 # Generated by Django 4.1.13 on 2024-04-16 11:43
+import threading
 
 import django.contrib.postgres.search
 from django.db import migrations
@@ -44,6 +45,17 @@ def save_keywords(apps, schema_editor):
         print(e)
 
 
+def async_save_keywords(apps, schema_editor):
+    """Start ``save_keywords`` on a background thread and return without waiting.
+
+    NOTE(review): the thread is never joined, so the keyword backfill may
+    still be running (or may have failed) after this migration step has
+    returned -- confirm that this is the intended trade-off.
+    """
+    thread = threading.Thread(target=save_keywords, args=(apps, schema_editor))
+    thread.start()
+
+
 class Migration(migrations.Migration):
     dependencies = [
         ('embedding', '0001_initial'),
@@ -55,5 +67,5 @@ class Migration(migrations.Migration):
             name='search_vector',
             field=django.contrib.postgres.search.SearchVectorField(default='', verbose_name='分词'),
         ),
-        migrations.RunPython(save_keywords)
+        migrations.RunPython(async_save_keywords)
     ]