From 7f30d03abda16fce2b91b50f0f2ca91b4180b6e9 Mon Sep 17 00:00:00 2001
From: shaohuzhang1 <80892890+shaohuzhang1@users.noreply.github.com>
Date: Thu, 9 May 2024 15:55:35 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=88=86=E8=AF=8D?=
 =?UTF-8?q?=E8=B6=85=E8=BF=87=E6=95=B0=E6=8D=AE=E5=BA=93=E6=9C=80=E5=A4=A7?=
 =?UTF-8?q?=E9=99=90=E5=88=B6=20(#401)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 apps/common/util/ts_vecto_util.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/apps/common/util/ts_vecto_util.py b/apps/common/util/ts_vecto_util.py
index 9def9585f..451d87bf8 100644
--- a/apps/common/util/ts_vecto_util.py
+++ b/apps/common/util/ts_vecto_util.py
@@ -85,10 +85,11 @@ def to_ts_vector(text: str):
     # 替换字符串
     text = replace_word(word_dict, text)
     # 分词
-    result = jieba.posseg.lcut(text, HMM=True, use_paddle=True)
+    filter_word = jieba.analyse.extract_tags(text, topK=100)
+    result = jieba.lcut(text, HMM=True, use_paddle=True)
     # 过滤标点符号
-    result = [item for item in result if not jieba_remove_flag_list.__contains__(item.flag)]
-    result_ = [{'word': get_key_by_word_dict(result[index].word, word_dict), 'index': index} for index in
+    result = [item for item in result if filter_word.__contains__(item) and len(item) < 10]
+    result_ = [{'word': get_key_by_word_dict(result[index], word_dict), 'index': index} for index in
                range(len(result))]
     result_group = group_by(result_, lambda r: r['word'])
     return " ".join(