From 7f30d03abda16fce2b91b50f0f2ca91b4180b6e9 Mon Sep 17 00:00:00 2001 From: shaohuzhang1 <80892890+shaohuzhang1@users.noreply.github.com> Date: Thu, 9 May 2024 15:55:35 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=88=86=E8=AF=8D?= =?UTF-8?q?=E8=B6=85=E8=BF=87=E6=95=B0=E6=8D=AE=E5=BA=93=E6=9C=80=E5=A4=A7?= =?UTF-8?q?=E9=99=90=E5=88=B6=20(#401)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/util/ts_vecto_util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/common/util/ts_vecto_util.py b/apps/common/util/ts_vecto_util.py index 9def9585f..451d87bf8 100644 --- a/apps/common/util/ts_vecto_util.py +++ b/apps/common/util/ts_vecto_util.py @@ -85,10 +85,11 @@ def to_ts_vector(text: str): # 替换字符串 text = replace_word(word_dict, text) # 分词 - result = jieba.posseg.lcut(text, HMM=True, use_paddle=True) + filter_word = jieba.analyse.extract_tags(text, topK=100) + result = jieba.lcut(text, HMM=True, use_paddle=True) # 过滤标点符号 - result = [item for item in result if not jieba_remove_flag_list.__contains__(item.flag)] - result_ = [{'word': get_key_by_word_dict(result[index].word, word_dict), 'index': index} for index in + result = [item for item in result if filter_word.__contains__(item) and len(item) < 10] + result_ = [{'word': get_key_by_word_dict(result[index], word_dict), 'index': index} for index in range(len(result))] result_group = group_by(result_, lambda r: r['word']) return " ".join(