MaxKB/apps/common/util/ts_vecto_util.py
2025-04-07 18:51:32 +08:00

89 lines
2.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
"""
@project: maxkb
@Author
@file ts_vecto_util.py
@date2024/4/16 15:26
@desc:
"""
import re
import uuid
from typing import List
import jieba
import jieba.posseg
jieba_word_list_cache = [chr(item) for item in range(38, 84)]
for jieba_word in jieba_word_list_cache:
jieba.add_word('#' + jieba_word + '#')
# r"(?i)\b(?:https?|ftp|tcp|file)://[^\s]+\b",
# 某些不分词数据
# r'"([^"]*)"'
word_pattern_list = [r"v\d+.\d+.\d+",
r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}"]
remove_chars = '\n , :\'<>@#¥%……&*!@#$%^&*() /"./'
jieba_remove_flag_list = ['x', 'w']
def get_word_list(text: str):
result = []
for pattern in word_pattern_list:
word_list = re.findall(pattern, text)
for child_list in word_list:
for word in child_list if isinstance(child_list, tuple) else [child_list]:
# 不能有: 所以再使用: 进行分割
if word.__contains__(':'):
item_list = word.split(":")
for w in item_list:
result.append(w)
else:
result.append(word)
return result
def replace_word(word_dict, text: str):
for key in word_dict:
pattern = '(?<!#)' + re.escape(word_dict[key]) + '(?!#)'
text = re.sub(pattern, key, text)
return text
def get_word_key(text: str, use_word_list):
j_word = next((j for j in jieba_word_list_cache if j not in text and all(j not in used for used in use_word_list)),
None)
if j_word:
return j_word
j_word = str(uuid.uuid1())
jieba.add_word(j_word)
return j_word
def to_word_dict(word_list: List, text: str):
word_dict = {}
for word in word_list:
key = get_word_key(text, set(word_dict))
word_dict['#' + key + '#'] = word
return word_dict
def get_key_by_word_dict(key, word_dict):
v = word_dict.get(key)
if v is None:
return key
return v
def to_ts_vector(text: str):
# 分词
result = jieba.lcut(text, cut_all=True)
return " ".join(result)
def to_query(text: str):
extract_tags = jieba.lcut(text, cut_all=True)
result = " ".join(extract_tags)
return result