# coding=utf-8 """ @project: maxkb @Author:虎 @file: ts_vecto_util.py @date:2024/4/16 15:26 @desc: """ import re import uuid_utils.compat as uuid from typing import List import jieba import jieba.posseg jieba_word_list_cache = [chr(item) for item in range(38, 84)] for jieba_word in jieba_word_list_cache: jieba.add_word('#' + jieba_word + '#') # r"(?i)\b(?:https?|ftp|tcp|file)://[^\s]+\b", # 某些不分词数据 # r'"([^"]*)"' word_pattern_list = [r"v\d+.\d+.\d+", r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}"] remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./' jieba_remove_flag_list = ['x', 'w'] def get_word_list(text: str): result = [] for pattern in word_pattern_list: word_list = re.findall(pattern, text) for child_list in word_list: for word in child_list if isinstance(child_list, tuple) else [child_list]: # 不能有: 所以再使用: 进行分割 if word.__contains__(':'): item_list = word.split(":") for w in item_list: result.append(w) else: result.append(word) return result def replace_word(word_dict, text: str): for key in word_dict: pattern = '(?