mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
fix: 修复分词超过数据库最大限制 (#401)
This commit is contained in:
parent
8159fef722
commit
7f30d03abd
|
|
@@ -85,10 +85,11 @@ def to_ts_vector(text: str):
|
|||
# 替换字符串
|
||||
text = replace_word(word_dict, text)
|
||||
# 分词
|
||||
result = jieba.posseg.lcut(text, HMM=True, use_paddle=True)
|
||||
filter_word = jieba.analyse.extract_tags(text, topK=100)
|
||||
result = jieba.lcut(text, HMM=True, use_paddle=True)
|
||||
# 过滤标点符号
|
||||
result = [item for item in result if not jieba_remove_flag_list.__contains__(item.flag)]
|
||||
result_ = [{'word': get_key_by_word_dict(result[index].word, word_dict), 'index': index} for index in
|
||||
result = [item for item in result if filter_word.__contains__(item) and len(item) < 10]
|
||||
result_ = [{'word': get_key_by_word_dict(result[index], word_dict), 'index': index} for index in
|
||||
range(len(result))]
|
||||
result_group = group_by(result_, lambda r: r['word'])
|
||||
return " ".join(
|
||||
|
|
|
|||
Loading…
Reference in New Issue