diff --git a/docs/api_userguide.md b/docs/api_userguide.md index 5931044..9684ebe 100644 --- a/docs/api_userguide.md +++ b/docs/api_userguide.md @@ -250,8 +250,10 @@ detect_obj.rate_predict(data) ![data_info](images/python_api_rate_predict.png) ### 三、LIB库 -Metis工程目录下time_series_detector/lib目录为时间序列异常检测学件,可以在python代码或C代码中调用 -libdetect.so在CentOs7.2下编译,目前仅支持在CentOs7.2或更高版本Centos使用 +Metis工程目录下time_series_detector/lib为学件动态库目录,库文件可以在代码中加载调用 + +libdetect.so目前支持在CentOs7.2+系统环境下使用 + #### Python代码中调用: @@ -277,7 +279,7 @@ handle = metis_lib.load_model("./xgb_default_model") from ctypes import * class ValueData(Structure): - _fields_ = [('value_a', POINTER(c_int)), ('value_b', POINTER(c_int)), ('value_c', POINTER(c_int)), + _fields_ = [('data_a', POINTER(c_int)), ('data_b', POINTER(c_int)), ('data_c', POINTER(c_int)), ('len_a', c_int), ('len_b', c_int), ('len_c', c_int)] # test data @@ -309,9 +311,9 @@ print result, prob ``` typedef struct { - int* value_a; - int* value_b; - int* value_c; + int* data_a; + int* data_b; + int* data_c; int len_a; int len_b; int len_c; @@ -322,7 +324,7 @@ typedef struct { | 名称 | 类型 |必填| 默认值 | 说明 | | --- | --- | --- |---- | --- | | handle| int| 是| 无|模型句柄,由load_model返回| -| ValueData| struct| 是| 无|待检测数据| +| data_value| ValueData| 是| 无|待检测数据| @@ -361,7 +363,7 @@ metis_lib = so("./libdetect.so") from ctypes import * class RateData(Structure): -_fields_ = [('value_a', POINTER(c_double)), ('value_b', POINTER(c_double)), ('value_c', POINTER(c_double)), +_fields_ = [('data_a', POINTER(c_double)), ('data_b', POINTER(c_double)), ('data_c', POINTER(c_double)), ('len_a', c_int), ('len_b', c_int), ('len_c', c_int)] # test data @@ -393,9 +395,9 @@ print result, prob ``` typedef struct { - double* value_a; - double* value_b; - double* value_c; + double* data_a; + double* data_b; + double* data_c; int len_a; int len_b; int len_c; @@ -405,7 +407,7 @@ typedef struct { | 名称 | 类型 |必填| 默认值 | 说明 | | --- | --- | --- |---- | --- | -| ValueData| struct| 是| 无|待检测数据| +| 
data_value| RateData| 是| 无|待检测数据| @@ -424,9 +426,9 @@ typedef struct { | result | c_int | 检测结果是否异常。0:异常;1:正常 | | prob | c_float | 概率值,值越小,判定为异常的置信度越高 | -####C代码中调用: +#### C代码中调用: -在C中调用检测函数,需要include头文件detect.h,在编译时链接libdetect.so文件。 +在C中调用检测函数,需要include头文件detect.h,在编译时链接libdetect.so文件 ##### 1、量值检测 * 功能说明:根据参考数据检测最近一个数据点是否异常 @@ -451,9 +453,9 @@ typedef struct { ``` typedef struct { - int* value_a; - int* value_b; - int* value_c; + int* data_a; + int* data_b; + int* data_c; int len_a; int len_b; int len_c; @@ -464,7 +466,7 @@ typedef struct { | 名称 | 类型 |必填| 默认值 | 说明 | | --- | --- | --- |---- | --- | | handle| int| 是| 无|模型句柄,由load_model返回| -| ValueData| struct| 是| 无|待检测数据| +| value_data| ValueData| 是| 无|待检测数据| @@ -501,9 +503,9 @@ printf ("ret=%d result =%d prob = %f \n", ret, sample_result, prob); ``` typedef struct { - double* value_a; - double* value_b; - double* value_c; + double* data_a; + double* data_b; + double* data_c; int len_a; int len_b; int len_c; @@ -513,7 +515,7 @@ typedef struct { | 名称 | 类型 |必填| 默认值 | 说明 | | --- | --- | --- |---- | --- | -| ValueData| struct| 是| 无|待检测数据| +| rate_data| RateData| 是| 无|待检测数据| * 返回参数: diff --git a/time_series_detector/common/tsd_common.py b/time_series_detector/common/tsd_common.py index c8a5064..41ce338 100644 --- a/time_series_detector/common/tsd_common.py +++ b/time_series_detector/common/tsd_common.py @@ -78,3 +78,19 @@ def normalize_time_series(split_time_series): normalized_data_a ] return normalized_split_time_series + + +def normalize_time_series_by_max_min(split_time_series): + """ + Normalize the split_time_series by max_min_normalization. 
+ + :param split_time_series: [[data_c_left], [data_c_right], [data_b_left], [data_b_right], [data_a]] + :return: max_min_normalized time_series (all zeros when the series is constant) + """ + time_series = split_time_series[0] + split_time_series[1][1:] + split_time_series[2] + split_time_series[3][1:] + split_time_series[4] + max_value = np.max(time_series) + min_value = np.min(time_series) + normalized_time_series = [0.0] * len(time_series) # constant series: no range to scale by, so default to zeros + if max_value - min_value > 0: + normalized_time_series = list((np.array(time_series) - min_value) / float(max_value - min_value)) + return normalized_time_series diff --git a/time_series_detector/feature/classification_features.py b/time_series_detector/feature/classification_features.py index 6d53dab..4df69bb 100644 --- a/time_series_detector/feature/classification_features.py +++ b/time_series_detector/feature/classification_features.py @@ -10,6 +10,8 @@ Unless required by applicable law or agreed to in writing, software distributed import numpy as np import tsfresh.feature_extraction.feature_calculators as ts_feature_calculators +from time_series_detector.common.tsd_common import DEFAULT_WINDOW, split_time_series +from time_series_detector.feature.statistical_features import time_series_mean, time_series_variance, time_series_standard_deviation, time_series_median def time_series_autocorrelation(x): @@ -35,6 +37,8 @@ def time_series_autocorrelation(x): :return type: float """ lag = int((len(x) - 3) / 5) + if np.sqrt(np.var(x)) < 1e-10: + return 0 return ts_feature_calculators.autocorrelation(x, lag) @@ -47,6 +51,8 @@ def time_series_coefficient_of_variation(x): :return: the value of this feature :return type: float """ + if np.sqrt(np.var(x)) < 1e-10: + return 0 return np.mean(x) / np.sqrt(np.var(x)) @@ -74,15 +80,132 @@ def time_series_binned_entropy(x): result.append(ts_feature_calculators.binned_entropy(x, value)) return result + +def time_series_value_distribution(x): + """ + Given buckets, calculate the percentage of elements in the whole time series + in different buckets + + :param x: normalized time series + :type x: 
pandas.Series + :return: the values of this feature + :return type: list + """ + thresholds = [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1.0, 1.0] + return list(np.histogram(x, bins=thresholds)[0] / float(len(x))) + + +def time_series_daily_parts_value_distribution(x): + """ + Given buckets, calculate the percentage of elements in three subsequences + of the whole time series in different buckets + + :param x: normalized time series + :type x: pandas.Series + :return: the values of this feature + :return type: list + """ + thresholds = [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1.0, 1.0] + split_value_list = split_time_series(x, DEFAULT_WINDOW) + data_c = split_value_list[0] + split_value_list[1][1:] + data_b = split_value_list[2] + split_value_list[3][1:] + data_a = split_value_list[4] + count_c = list(np.histogram(data_c, bins=thresholds)[0]) + count_b = list(np.histogram(data_b, bins=thresholds)[0]) + count_a = list(np.histogram(data_a, bins=thresholds)[0]) + return list(np.array(count_c) / float(len(data_c))) + list(np.array(count_b) / float(len(data_b))) + list(np.array(count_a) / float(len(data_a))) + + +def time_series_daily_parts_value_distribution_with_threshold(x): + """ + Split the whole time series into three parts: c, b, a. 
+ Given a threshold = 0.01, return the percentage of elements of time series + which are less than threshold + + :param x: normalized time series + :type x: pandas.Series + :return: 6 values of this feature + :return type: list + """ + threshold = 0.01 + split_value_list = split_time_series(x, DEFAULT_WINDOW) + data_c = split_value_list[0] + split_value_list[1][1:] + data_b = split_value_list[2] + split_value_list[3][1:] + data_a = split_value_list[4] + + # the number of elements in time series which is less than threshold: + nparray_data_c_threshold = np.array(data_c) + nparray_data_c_threshold[nparray_data_c_threshold < threshold] = -1 + nparray_data_b_threshold = np.array(data_b) + nparray_data_b_threshold[nparray_data_b_threshold < threshold] = -1 + nparray_data_a_threshold = np.array(data_a) + nparray_data_a_threshold[nparray_data_a_threshold < threshold] = -1 + + # the total number of elements in time series which is less than threshold: + nparray_threshold_count = (nparray_data_c_threshold == -1).sum() + (nparray_data_b_threshold == -1).sum() + (nparray_data_a_threshold == -1).sum() + + if nparray_threshold_count == 0: + features = [0, 0, 0] + else: + features = [ + (nparray_data_c_threshold == -1).sum() / float(nparray_threshold_count), + (nparray_data_b_threshold == -1).sum() / float(nparray_threshold_count), + (nparray_data_a_threshold == -1).sum() / float(nparray_threshold_count) + ] + + features.extend([ + (nparray_data_c_threshold == -1).sum() / float(len(data_c)), + (nparray_data_b_threshold == -1).sum() / float(len(data_b)), + (nparray_data_a_threshold == -1).sum() / float(len(data_a)) + ]) + return features + + +def time_series_window_parts_value_distribution_with_threshold(x): + """ + Split the whole time series into five parts. 
+ Given a threshold = 0.01, return the percentage of elements of time series + which are less than threshold + + :param x: normalized time series + :type x: pandas.Series + :return: 5 values of this feature + :return type: list + """ + threshold = 0.01 + split_value_list = split_time_series(x, DEFAULT_WINDOW) + + count_list = [] + for value_list in split_value_list: + nparray_threshold = np.array(value_list) + nparray_threshold[nparray_threshold < threshold] = -1 + count_list.append((nparray_threshold == -1).sum()) + + if sum(count_list) == 0: + features = [0, 0, 0, 0, 0] + else: + features = list(np.array(count_list) / float((DEFAULT_WINDOW + 1))) + + return features + + # add yourself classification features here... def get_classification_features(x): classification_features = [ + time_series_mean(x), + time_series_variance(x), + time_series_standard_deviation(x), + time_series_median(x), time_series_autocorrelation(x), time_series_coefficient_of_variation(x) ] + classification_features.extend(time_series_value_distribution(x)) + classification_features.extend(time_series_daily_parts_value_distribution(x)) + classification_features.extend(time_series_daily_parts_value_distribution_with_threshold(x)) + classification_features.extend(time_series_window_parts_value_distribution_with_threshold(x)) classification_features.extend(time_series_binned_entropy(x)) - # append yourself classification features here... + # add yourself classification features here... 
return classification_features diff --git a/time_series_detector/feature/feature_service.py b/time_series_detector/feature/feature_service.py index 19379cc..5e8e237 100644 --- a/time_series_detector/feature/feature_service.py +++ b/time_series_detector/feature/feature_service.py @@ -34,9 +34,10 @@ def extract_features(time_series, window): split_time_series = tsd_common.split_time_series(time_series, window) # nomalize time_series normalized_split_time_series = tsd_common.normalize_time_series(split_time_series) + max_min_normalized_time_series = tsd_common.normalize_time_series_by_max_min(split_time_series) s_features = statistical_features.get_statistical_features(normalized_split_time_series[4]) f_features = fitting_features.get_fitting_features(normalized_split_time_series) - c_features = classification_features.get_classification_features(normalized_split_time_series[0] + normalized_split_time_series[1][1:] + normalized_split_time_series[2] + normalized_split_time_series[3][1:] + normalized_split_time_series[4]) + c_features = classification_features.get_classification_features(max_min_normalized_time_series) # combine features with types features = s_features + f_features + c_features return features diff --git a/time_series_detector/feature/fitting_features.py b/time_series_detector/feature/fitting_features.py index 9a146b6..d78702e 100644 --- a/time_series_detector/feature/fitting_features.py +++ b/time_series_detector/feature/fitting_features.py @@ -49,7 +49,7 @@ def time_series_weighted_moving_average(x): for w in range(1, min(50, DEFAULT_WINDOW), 5): w = min(len(x), w) # avoid the case len(value_list) < w coefficient = np.array(range(1, w + 1)) - temp_list.append((np.dot(coefficient, x[-w:])) / (w * (w + 1) / 2)) + temp_list.append((np.dot(coefficient, x[-w:])) / float(w * (w + 1) / 2)) return list(np.array(temp_list) - x[-1]) @@ -210,6 +210,11 @@ def time_series_periodic_features(data_c_left, data_c_right, data_b_left, data_b 
periodic_features.append(-1) else: periodic_features.append(1) + + step = DEFAULT_WINDOW // 6 # floor division: range() and slice bounds need an integer step + for w in range(1, DEFAULT_WINDOW, step): + periodic_features.append(min(max(data_a[w - 1:w + step]) - data_a[-1], 0)) + periodic_features.append(max(min(data_a[w - 1:w + step]) - data_a[-1], 0)) return periodic_features # add yourself fitting features here... diff --git a/time_series_detector/model/xgb_default_model b/time_series_detector/model/xgb_default_model index 362a7d9..7547402 100644 Binary files a/time_series_detector/model/xgb_default_model and b/time_series_detector/model/xgb_default_model differ