# File: aeon/classification/shapelet_based/_rsast_classifier.py
"""Random Scalable and Accurate Subsequence Transform (RSAST).

Pipeline classifier using the RSAST transformer and an sklearn classifier.
"""

__maintainer__ = []
__all__ = ["RSASTClassifier"]

from operator import itemgetter

import numpy as np
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline

from aeon.base._base import _clone_estimator
from aeon.classification import BaseClassifier
from aeon.transformations.collection.shapelet_based import RSAST
from aeon.utils.numba.general import z_normalise_series


class RSASTClassifier(BaseClassifier):
    """Pipeline of the RSAST [1]_ transformer and an sklearn classifier.

    Parameters
    ----------
    n_random_points : int, default=10
        The number of initial random points to extract.
    len_method : str, default="both"
        The type of statistical tool used to get the length of shapelets:
        "both" = ACF & PACF, "ACF" = ACF, "PACF" = PACF,
        "None" = extract randomly any length from the time series.
    nb_inst_per_class : int, default=10
        The number of reference time series to select per class.
    seed : int, default=None
        The seed of the random generator.
    classifier : sklearn compatible classifier, default=None
        If None, a RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)) is used.
    n_jobs : int, default=-1
        Number of threads to use for the transform.

    References
    ----------
    .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023).
       RSAST: Sampling Shapelets for Time Series Classification.
       https://hal.science/hal-04311309/

    Examples
    --------
    >>> from aeon.classification.shapelet_based import RSASTClassifier
    >>> from aeon.datasets import load_unit_test
    >>> X_train, y_train = load_unit_test(split="train")
    >>> X_test, y_test = load_unit_test(split="test")
    >>> clf = RSASTClassifier()
    >>> clf.fit(X_train, y_train)
    RSASTClassifier(...)
    >>> y_pred = clf.predict(X_test)
    """

    _tags = {
        "capability:multithreading": True,
        "capability:multivariate": False,
        "algorithm_type": "subsequence",
    }

    def __init__(
        self,
        n_random_points=10,
        len_method="both",
        nb_inst_per_class=10,
        seed=None,
        classifier=None,
        n_jobs=-1,
    ):
        super().__init__()
        # No trailing commas here: `self.x = x,` would silently store a 1-tuple
        # instead of the value that was passed in.
        self.n_random_points = n_random_points
        self.len_method = len_method
        self.nb_inst_per_class = nb_inst_per_class
        self.n_jobs = n_jobs
        self.seed = seed
        self.classifier = classifier

    def _fit(self, X, y):
        """Fit RSASTClassifier to the training data.

        Parameters
        ----------
        X : np.ndarray shape (n_cases, n_channels, n_timepoints)
            The training input samples.
        y : array-like or list
            The class values for X.

        Returns
        -------
        self : RSASTClassifier
            This pipeline classifier.
        """
        # Arguments are passed positionally in the order
        # (n_random_points, len_method, nb_inst_per_class, seed, n_jobs).
        # NOTE(review): confirm this order against the RSAST transformer's
        # signature, which is defined in another module.
        self._transformer = RSAST(
            self.n_random_points,
            self.len_method,
            self.nb_inst_per_class,
            self.seed,
            self.n_jobs,
        )

        self._classifier = _clone_estimator(
            (
                RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
                if self.classifier is None
                else self.classifier
            ),
            self.seed,
        )

        self._pipeline = make_pipeline(self._transformer, self._classifier)
        self._pipeline.fit(X, y)

        return self

    def _predict(self, X):
        """Predict labels for the input.

        Parameters
        ----------
        X : np.ndarray shape (n_cases, n_channels, n_timepoints)
            The input samples to label.

        Returns
        -------
        array-like or list
            Predicted class labels.
        """
        return self._pipeline.predict(X)

    def _predict_proba(self, X):
        """Predict label probabilities for the input.

        Parameters
        ----------
        X : np.ndarray shape (n_cases, n_channels, n_timepoints)
            The input samples to score.

        Returns
        -------
        dists : np.ndarray shape (n_cases, n_classes)
            Predicted class probabilities. When the wrapped classifier has no
            ``predict_proba``, the predicted class gets probability 1 and all
            other classes 0.
        """
        if callable(getattr(self._classifier, "predict_proba", None)):
            return self._pipeline.predict_proba(X)

        # Fallback: one-hot encode the hard predictions.
        dists = np.zeros((X.shape[0], self.n_classes_))
        preds = self._pipeline.predict(X)
        for i, pred in enumerate(preds):
            dists[i, np.where(self.classes_ == pred)] = 1
        return dists

    def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5):
        """Plot the most important features on ts.

        Each selected kernel is drawn at the position of ``ts`` where the
        squared distance between the z-normalised kernel and the z-normalised
        window of ``ts`` is smallest.

        Parameters
        ----------
        ts : float[:]
            The time series.
        feature_importance : float[:]
            The importance of each feature in the transformed data.
        limit : int, default=5
            The maximum number of features to plot.

        Returns
        -------
        fig : plt.figure
            The figure.

        Raises
        ------
        ValueError
            If there is no feature to plot (``limit`` < 1 or no kernels).
        """
        import matplotlib.pyplot as plt

        features = zip(self._transformer._kernel_orig, feature_importance)
        sorted_features = sorted(features, key=itemgetter(1), reverse=True)

        max_ = min(limit, len(sorted_features))
        if max_ < 1:
            raise ValueError(
                "No feature to plot: limit must be >= 1 and the transformer "
                "must hold at least one kernel."
            )

        # squeeze=False keeps `axes` two-dimensional even when max_ == 1, so
        # the indexing below works for a single subplot as well.
        fig, axes = plt.subplots(
            1,
            max_,
            sharey=True,
            figsize=(3 * max_, 3),
            tight_layout=True,
            squeeze=False,
        )

        for f in range(max_):
            kernel, _ = sorted_features[f]
            znorm_kernel = z_normalise_series(kernel)
            d_best = np.inf
            # Default position in case no window improves on `inf`
            # (e.g. the kernel is longer than ts).
            start_pos = 0
            # `+ 1` so the last full-length window of ts is also considered.
            for i in range(ts.size - kernel.size + 1):
                s = z_normalise_series(ts[i : i + kernel.size])
                d = np.sum((s - znorm_kernel) ** 2)
                if d < d_best:
                    d_best = d
                    start_pos = i
            ax = axes[0, f]
            ax.plot(range(start_pos, start_pos + kernel.size), kernel, linewidth=5)
            ax.plot(range(ts.size), ts, linewidth=2)
            ax.set_title(f"feature: {f+1}")

        return fig


# --- next file in the patch: aeon/transformations/collection/shapelet_based/_rsast.py ---
+""" + +import numpy as np + +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.utils.validation import check_array, check_X_y, check_is_fitted + +from sklearn.ensemble import RandomForestClassifier, VotingClassifier + +from sklearn.linear_model import RidgeClassifierCV, LogisticRegressionCV, LogisticRegression, RidgeClassifier + + +from sklearn.linear_model._base import LinearClassifierMixin +from sklearn.pipeline import Pipeline + +#from sktime.utils.data_processing import from_2d_array_to_nested +#from sktime.transformations.panel.rocket import Rocket + +from numba import njit, prange + +#from mass_ts import * + +import pandas as pd + +from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning +from statsmodels.tsa.stattools import acf, pacf + +import time + +import os +from operator import itemgetter + + + +from utils_sast import from_2d_array_to_nested, znormalize_array, load_dataset, format_dataset, plot_most_important_features, plot_most_important_feature_on_ts, plot_most_important_feature_sast_on_ts +from aeon.classification.shapelet_based import RDSTClassifier +#from sktime.datasets import load_UCR_UEA_dataset + + + + + +@njit(fastmath=False) +def apply_kernel(ts, arr): + d_best = np.inf # sdist + m = ts.shape[0] + kernel = arr[~np.isnan(arr)] # ignore nan + + # profile = mass2(ts, kernel) + # d_best = np.min(profile) + + l = kernel.shape[0] + for i in range(m - l + 1): + d = np.sum((znormalize_array(ts[i:i+l]) - kernel)**2) + if d < d_best: + d_best = d + + return d_best + + +@njit(parallel=True, fastmath=True) +def apply_kernels(X, kernels): + nbk = len(kernels) + out = np.zeros((X.shape[0], nbk), dtype=np.float32) + for i in prange(nbk): + k = kernels[i] + for t in range(X.shape[0]): + ts = X[t] + out[t][i] = apply_kernel(ts, k) + return out + + +class SAST(BaseEstimator, ClassifierMixin): + + def __init__(self, cand_length_list, shp_step=1, nb_inst_per_class=1, random_state=None, classifier=None): + 
super(SAST, self).__init__() + self.cand_length_list = cand_length_list + self.shp_step = shp_step + self.nb_inst_per_class = nb_inst_per_class + self.kernels_ = None + self.kernel_orig_ = None # not z-normalized kernels + self.kernels_generators_ = {} + self.random_state = np.random.RandomState(random_state) if not isinstance( + random_state, np.random.RandomState) else random_state + + self.classifier = classifier + + def get_params(self, deep=True): + return { + 'cand_length_list': self.cand_length_list, + 'shp_step': self.shp_step, + 'nb_inst_per_class': self.nb_inst_per_class, + 'classifier': self.classifier + } + + def init_sast(self, X, y): + + self.cand_length_list = np.array(sorted(self.cand_length_list)) + + assert self.cand_length_list.ndim == 1, 'Invalid shapelet length list: required list or tuple, or a 1d numpy array' + + if self.classifier is None: + self.classifier = RandomForestClassifier( + min_impurity_decrease=0.05, max_features=None) + + classes = np.unique(y) + self.num_classes = classes.shape[0] + + candidates_ts = [] + for c in classes: + X_c = X[y == c] + + # convert to int because if self.nb_inst_per_class is float, the result of np.min() will be float + cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) + choosen = self.random_state.permutation(X_c.shape[0])[:cnt] + candidates_ts.append(X_c[choosen]) + self.kernels_generators_[c] = X_c[choosen] + + candidates_ts = np.concatenate(candidates_ts, axis=0) + + self.cand_length_list = self.cand_length_list[self.cand_length_list <= X.shape[1]] + + max_shp_length = max(self.cand_length_list) + + n, m = candidates_ts.shape + + n_kernels = n * np.sum([m - l + 1 for l in self.cand_length_list]) + + self.kernels_ = np.full( + (n_kernels, max_shp_length), dtype=np.float32, fill_value=np.nan) + self.kernel_orig_ = [] + + k = 0 + + for shp_length in self.cand_length_list: + for i in range(candidates_ts.shape[0]): + for j in range(0, candidates_ts.shape[1] - shp_length + 1, self.shp_step): 
+ end = j + shp_length + can = np.squeeze(candidates_ts[i][j: end]) + self.kernel_orig_.append(can) + self.kernels_[k, :shp_length] = znormalize_array(can) + + k += 1 + + def fit(self, X, y): + + X, y = check_X_y(X, y) # check the shape of the data + + # randomly choose reference time series and generate kernels + self.init_sast(X, y) + + # subsequence transform of X + X_transformed = apply_kernels(X, self.kernels_) + + self.classifier.fit(X_transformed, y) # fit the classifier + + return self + + def predict(self, X): + + check_is_fitted(self) # make sure the classifier is fitted + + X = check_array(X) # validate the shape of X + + # subsequence transform of X + X_transformed = apply_kernels(X, self.kernels_) + + return self.classifier.predict(X_transformed) + + def predict_proba(self, X): + check_is_fitted(self) # make sure the classifier is fitted + + X = check_array(X) # validate the shape of X + + # subsequence transform of X + X_transformed = apply_kernels(X, self.kernels_) + + if isinstance(self.classifier, LinearClassifierMixin): + return self.classifier._predict_proba_lr(X_transformed) + return self.classifier.predict_proba(X_transformed) + + +class SASTEnsemble(BaseEstimator, ClassifierMixin): + + def __init__(self, cand_length_list, shp_step=1, nb_inst_per_class=1, random_state=None, classifier=None, weights=None, n_jobs=None): + super(SASTEnsemble, self).__init__() + self.cand_length_list = cand_length_list + self.shp_step = shp_step + self.nb_inst_per_class = nb_inst_per_class + self.classifier = classifier + self.random_state = random_state + self.n_jobs = n_jobs + + self.saste = None + + self.weights = weights + + assert isinstance(self.classifier, BaseEstimator) + + self.init_ensemble() + + def init_ensemble(self): + estimators = [] + for i, candidate_lengths in enumerate(self.cand_length_list): + clf = clone(self.classifier) + sast = SAST(cand_length_list=candidate_lengths, + nb_inst_per_class=self.nb_inst_per_class, + 
random_state=self.random_state, + shp_step=self.shp_step, + classifier=clf) + estimators.append((f'sast{i}', sast)) + + self.saste = VotingClassifier( + estimators=estimators, voting='soft', n_jobs=self.n_jobs, weights=self.weights) + + def fit(self, X, y): + self.saste.fit(X, y) + return self + + def predict(self, X): + return self.saste.predict(X) + + def predict_proba(self, X): + return self.saste.predict_proba(X) + + + +class RSAST(BaseEstimator, ClassifierMixin): + + def __init__(self,n_random_points=10, nb_inst_per_class=10, len_method="both", random_state=None, classifier=None, sel_inst_wrepl=False,sel_randp_wrepl=False, half_instance=False, half_len=False,n_shapelet_samples=None ): + super(RSAST, self).__init__() + self.n_random_points = n_random_points + self.nb_inst_per_class = nb_inst_per_class + self.len_method = len_method + self.random_state = np.random.RandomState(random_state) if not isinstance( + random_state, np.random.RandomState) else random_state + self.classifier = classifier + self.cand_length_list = None + self.kernels_ = None + self.kernel_orig_ = None # not z-normalized kernels + self.kernel_permutated_ = None + self.kernels_generators_ = None + self.class_generators_ = None + self.sel_inst_wrepl=sel_inst_wrepl + self.sel_randp_wrepl=sel_randp_wrepl + self.half_instance=half_instance + self.half_len=half_len + self.time_calculating_weights = None + self.time_creating_subsequences = None + self.time_transform_dataset = None + self.time_classifier = None + self.n_shapelet_samples =n_shapelet_samples + + def get_params(self, deep=True): + return { + 'len_method': self.len_method, + 'n_random_points': self.n_random_points, + 'nb_inst_per_class': self.nb_inst_per_class, + 'sel_inst_wrepl':self.sel_inst_wrepl, + 'sel_randp_wrepl':self.sel_randp_wrepl, + 'half_instance':self.half_instance, + 'half_len':self.half_len, + 'classifier': self.classifier, + 'cand_length_list': self.cand_length_list + } + + def init_sast(self, X, y): + #0- initialize 
variables and convert values in "y" to string + start = time.time() + y=np.asarray([str(x_s) for x_s in y]) + + self.cand_length_list = {} + self.kernel_orig_ = [] + self.kernels_generators_ = [] + self.class_generators_ = [] + + list_kernels =[] + + + + n = [] + classes = np.unique(y) + self.num_classes = classes.shape[0] + m_kernel = 0 + + #1--calculate ANOVA per each time t throught the lenght of the TS + for i in range (X.shape[1]): + statistic_per_class= {} + for c in classes: + assert len(X[np.where(y==c)[0]][:,i])> 0, 'Time t without values in TS' + + statistic_per_class[c]=X[np.where(y==c)[0]][:,i] + #print("statistic_per_class- i:"+str(i)+', c:'+str(c)) + #print(statistic_per_class[c].shape) + + + #print('Without pd series') + #print(statistic_per_class) + + statistic_per_class=pd.Series(statistic_per_class) + #statistic_per_class = list(statistic_per_class.values()) + # Calculate t-statistic and p-value + + try: + t_statistic, p_value = f_oneway(*statistic_per_class) + except DegenerateDataWarning or ConstantInputWarning: + p_value=np.nan + # Interpretation of the results + # if p_value < 0.05: " The means of the populations are significantly different." 
+ #print('pvalue', str(p_value)) + if np.isnan(p_value): + n.append(0) + else: + n.append(1-p_value) + end = time.time() + self.time_calculating_weights = end-start + + + #2--calculate PACF and ACF for each TS chossen in each class + start = time.time() + for i, c in enumerate(classes): + X_c = X[y == c] + if self.half_instance==True: + cnt = np.max([X_c.shape[0]//2, 1]).astype(int) + self.nb_inst_per_class=cnt + else: + cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) + #set if the selection of instances is with replacement (if false it is not posible to select the same intance more than one) + if self.sel_inst_wrepl ==False: + choosen = self.random_state.permutation(X_c.shape[0])[:cnt] + else: + choosen = self.random_state.choice(X_c.shape[0], cnt) + + + + + for rep, idx in enumerate(choosen): + self.cand_length_list[c+","+str(idx)+","+str(rep)] = [] + non_zero_acf=[] + if (self.len_method == "both" or self.len_method == "ACF" or self.len_method == "Max ACF") : + #2.1-- Compute Autorrelation per object + acf_val, acf_confint = acf(X_c[idx], nlags=len(X_c[idx])-1, alpha=.05) + prev_acf=0 + for j, conf in enumerate(acf_confint): + + if(3<=j and (0 < acf_confint[j][0] <= acf_confint[j][1] or acf_confint[j][0] <= acf_confint[j][1] < 0) ): + #Consider just the maximum ACF value + if prev_acf!=0 and self.len_method == "Max ACF": + non_zero_acf.remove(prev_acf) + self.cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_acf) + non_zero_acf.append(j) + self.cand_length_list[c+","+str(idx)+","+str(rep)].append(j) + prev_acf=j + + non_zero_pacf=[] + if (self.len_method == "both" or self.len_method == "PACF" or self.len_method == "Max PACF"): + #2.2 Compute Partial Autorrelation per object + pacf_val, pacf_confint = pacf(X_c[idx], method="ols", nlags=(len(X_c[idx])//2) - 1, alpha=.05) + prev_pacf=0 + for j, conf in enumerate(pacf_confint): + + if(3<=j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or pacf_confint[j][0] <= pacf_confint[j][1] < 0) ): + 
#Consider just the maximum PACF value + if prev_pacf!=0 and self.len_method == "Max PACF": + non_zero_pacf.remove(prev_pacf) + self.cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_pacf) + + non_zero_pacf.append(j) + self.cand_length_list[c+","+str(idx)+","+str(rep)].append(j) + prev_pacf=j + + if (self.len_method == "all"): + self.cand_length_list[c+","+str(idx)+","+str(rep)].extend(np.arange(3,1+ len(X_c[idx]))) + + #2.3-- Save the maximum autocorralated lag value as shapelet lenght + + if len(self.cand_length_list[c+","+str(idx)+","+str(rep)])==0: + #chose a random lenght using the lenght of the time series (added 1 since the range start in 0) + rand_value= self.random_state.choice(len(X_c[idx]), 1)[0]+1 + self.cand_length_list[c+","+str(idx)+","+str(rep)].extend([max(3,rand_value)]) + #elif len(non_zero_acf)==0: + #print("There is no AC in TS", idx, " of class ",c) + #elif len(non_zero_pacf)==0: + #print("There is no PAC in TS", idx, " of class ",c) + #else: + #print("There is AC and PAC in TS", idx, " of class ",c) + + #print("Kernel lenght list:",self.cand_length_list[c+","+str(idx)],"") + + #remove duplicates for the list of lenghts + self.cand_length_list[c+","+str(idx)+","+str(rep)]=list(set(self.cand_length_list[c+","+str(idx)+","+str(rep)])) + #print("Len list:"+str(self.cand_length_list[c+","+str(idx)+","+str(rep)])) + for max_shp_length in self.cand_length_list[c+","+str(idx)+","+str(rep)]: + + #2.4-- Choose randomly n_random_points point for a TS + #2.5-- calculate the weights of probabilities for a random point in a TS + if sum(n) == 0 : + # Determine equal weights of a random point point in TS is there are no significant points + # print('All p values in One way ANOVA are equal to 0') + weights = [1/len(n) for i in range(len(n))] + weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum(weights[:len(X_c[idx])-max_shp_length+1]) + else: + # Determine the weights of a random point point in TS (excluding points after n-l+1) + weights = n / 
np.sum(n) + weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum(weights[:len(X_c[idx])-max_shp_length+1]) + + if self.half_len==True: + self.n_random_points=np.max([len(X_c[idx])//2, 1]).astype(int) + + + if self.n_random_points > len(X_c[idx])-max_shp_length+1 and self.sel_randp_wrepl==False: + #set a upper limit for the posible of number of random points when selecting without replacement + limit_rpoint=len(X_c[idx])-max_shp_length+1 + rand_point_ts = self.random_state.choice(len(X_c[idx])-max_shp_length+1, limit_rpoint, p=weights, replace=self.sel_randp_wrepl) + #print("limit_rpoint:"+str(limit_rpoint)) + else: + rand_point_ts = self.random_state.choice(len(X_c[idx])-max_shp_length+1, self.n_random_points, p=weights, replace=self.sel_randp_wrepl) + #print("n_random_points:"+str(self.n_random_points)) + + #print("rpoints:"+str(rand_point_ts)) + + for i in rand_point_ts: + #2.6-- Extract the subsequence with that point + kernel = X_c[idx][i:i+max_shp_length].reshape(1,-1) + #print("kernel:"+str(kernel)) + if m_kernel n_samples (intances) + self.classifier=RidgeClassifierCV() + print("RidgeClassifierCV:"+str("size training")+str(X_transformed.shape[0])+"<="+" kernels"+str(X_transformed.shape[1])) + else: + print("LogisticRegression:"+str("size training")+str(X_transformed.shape[0])+">"+" kernels"+str(X_transformed.shape[1])) + self.classifier=LogisticRegression() + #self.classifier = RandomForestClassifier(min_impurity_decrease=0.05, max_features=None) + + start = time.time() + #print('X_transformed shape') + #print(X_transformed.shape) + #print('X_transformed') + #print(X_transformed) + + self.classifier.fit(X_transformed, y) # fit the classifier + end = time.time() + self.time_classifier = end-start + + return self + + def predict(self, X): + + check_is_fitted(self) # make sure the classifier is fitted + + X = check_array(X) # validate the shape of X + + # subsequence transform of X + X_transformed = apply_kernels(X, self.kernels_) + + return 
self.classifier.predict(X_transformed) + + def predict_proba(self, X): + check_is_fitted(self) # make sure the classifier is fitted + + X = check_array(X) # validate the shape of X + + # subsequence transform of X + X_transformed = apply_kernels(X, self.kernels_) + + if isinstance(self.classifier, LinearClassifierMixin): + return self.classifier._predict_proba_lr(X_transformed) + return self.classifier.predict_proba(X_transformed) + + +if __name__ == "__main__": + + ds='Chinatown' # Chosing a dataset from # Number of classes to consider + + rtype="numpy2D" + + #X_train, y_train = load_UCR_UEA_dataset(name=ds, split="train",extract_path="data", return_type=rtype) + + + #X_train=np.nan_to_num(X_train) + #y_train=np.nan_to_num(y_train) + + #X_test, y_test = load_UCR_UEA_dataset(name=ds, split="test", extract_path="data", return_type=rtype) + + #X_test=np.nan_to_num(X_test) + #y_test=np.nan_to_num(y_test) + #print('Format: load_UCR_UEA_dataset') + #print(X_train.shape) + #print(X_test.shape) + #print(y_train.shape) + #print(y_test.shape) + + + #y_train = list(map(int, y_train)) + #y_test =list(map(int, y_test)) + #print(X_train[0]) + + """ + print("ds:"+ds) + X_train_mod=[] + for i , element in enumerate(X_train): + element=np.array(element[0]) + print("TS N:"+str(i)+" len:"+str(element.shape)) + #print(element) + X_train_mod.append(element) + + X_train_mod= np.array(X_train_mod) + print(X_train_mod.shape) + + X_train_mod=np.nan_to_num(X_train_mod) + """ + + path=r"C:\Users\Surface pro\random_sast\sast\data" + ds_train_lds , ds_test_lds = load_dataset(ds_folder=path,ds_name=ds,shuffle=False) + X_test_lds, y_test_lds = format_dataset(ds_test_lds) + X_train_lds, y_train_lds = format_dataset(ds_train_lds) + + X_train_lds=np.nan_to_num(X_train_lds) + y_train_lds=np.nan_to_num(y_train_lds) + X_test_lds=np.nan_to_num(X_test_lds) + y_test_lds=np.nan_to_num(y_test_lds) + + print('Format: load_dataset') + print(X_train_lds.shape) + print(X_train_lds[0].shape) + 
print(X_train_lds[1].shape) + print(X_test_lds.shape) + + + print(y_train_lds.shape) + print(y_test_lds.shape) + + + + + start = time.time() + random_state = None + rsast_ridge = RSAST(n_random_points=10, nb_inst_per_class=10, len_method="both") + rsast_ridge.fit(X_train_lds, y_train_lds) + end = time.time() + print('rsast score :', rsast_ridge.score(X_test_lds, y_test_lds)) + print('duration:', end-start) + print('params:', rsast_ridge.get_params()) + + #print('classifier:',rsast_ridge.classifier.coef_[0]) + + #fname = f'images/chinatown-rf-class{c}-top5-features-on-ref-ts.jpg' + #print(f"ts.shape{pd.array(rsast_ridge.kernels_generators_).shape}") + #print(f"kernel_d.shape{pd.array(rsast_ridge.kernel_orig_).shape}") + + plot_most_important_feature_on_ts(set_ts=rsast_ridge.kernels_generators_, labels=rsast_ridge.class_generators_, features=rsast_ridge.kernel_orig_, scores=rsast_ridge.classifier.coef_[0], limit=3, offset=0,znormalized=False) + + plot_most_important_features(rsast_ridge.kernel_orig_, rsast_ridge.classifier.coef_[0], limit=3,scale_color=False) + + X_train = X_train_lds[:, np.newaxis, :] + X_test = X_test_lds[:, np.newaxis, :] + y_train=np.asarray([int(x_s) for x_s in y_train_lds]) + y_test=np.asarray([int(x_s) for x_s in y_test_lds]) + start = time.time() + + rdst = RDSTClassifier( + max_shapelets=4, + shapelet_lengths=[7], + proba_normalization=0, + save_transformed_data=True + ) + rdst = RDSTClassifier(proba_normalization=0) + rdst.fit(X_train, y_train) + end = time.time() + + + + print('rdst score :', rdst.score(X_test, y_test)) + print('duration:', end-start) + print('params:', rdst.get_params()) + """ + for i, shp in enumerate(rdst._transformer.shapelets_[0].squeeze()): + print('rdst shapelet values:',str(i+1)," shape:", shp.shape," shapelet:", shp ) + + for i, dilation in enumerate(rdst._transformer.shapelets_[2].squeeze()): + print('rdst dilation parameter:',str(i+1)," shape:", shp.shape," dilation:", dilation ) + + for i, treshold in 
enumerate(rdst._transformer.shapelets_[3].squeeze()): + print('rdst treshold parameter:',str(i+1)," shape:", shp.shape," treshold:", treshold ) + + for i, normalization in enumerate(rdst._transformer.shapelets_[4].squeeze()): + print('rdst normalization parameter:',str(i+1)," shape:", shp.shape," normalization:", normalization ) + + for i, coef in enumerate(rdst._estimator["ridgeclassifiercv"].coef_): + print('rdst coef:',str(i+1)," shape:", coef.shape," coef:", coef ) + """ + + features_cl=rdst._transformer.shapelets_[0].squeeze() + dilations_cl=rdst._transformer.shapelets_[2].squeeze() + + coef_cl=rdst._estimator["ridgeclassifiercv"].coef_[0] + features_cl=[a for a in features_cl for i in range(3)] + dilations_cl=[a for a in dilations_cl for i in range(3)] + type_features_cl=["min","argmin","SO"]*len(features_cl) + + for l in pd.unique(rsast_ridge.class_generators_): + + all=zip(rsast_ridge.kernels_generators_,rsast_ridge.class_generators_) + + ts_cl=list(filter(lambda x: x[1]==l,all))[0][0] + ts_cl=[ts_cl for i in range(len(features_cl))] + labels=[l for i in range(len(features_cl))] + plot_most_important_feature_on_ts(set_ts=ts_cl, labels=labels, features=features_cl, scores=coef_cl,dilations=dilations_cl,type_features=type_features_cl, limit=3, offset=0,znormalized=False) + plot_most_important_features(features_cl, coef_cl, dilations=dilations_cl, limit=3, scale_color=False) + """ + min_shp_length = 3 + max_shp_length = X_train_lds.shape[1] + candidate_lengths = np.arange(min_shp_length, max_shp_length+1) + # candidate_lengths = (3, 7, 9, 11) + nb_inst_per_class = 1 + ridge = RidgeClassifierCV(alphas = np.logspace(-3, 3, 10)) + + start = time.time() + random_state = None + sast_ridge = SAST(cand_length_list=candidate_lengths, + nb_inst_per_class=nb_inst_per_class, + random_state=random_state, classifier=ridge) + sast_ridge.fit(X_train_lds, y_train_lds) + end = time.time() + print('sast score :', sast_ridge.score(X_test_lds, y_test_lds)) + print('duration:', 
end-start) + print('params:', sast_ridge.get_params()) + #print('classifier:',rsast_ridge.classifier.coef_[0]) + + #fname = f'images/chinatown-rf-class{c}-top5-features-on-ref-ts.jpg' + #print(f"ts.shape{pd.array(rsast_ridge.kernels_generators_).shape}") + #print(f"kernel_d.shape{pd.array(rsast_ridge.kernel_orig_).shape}") + for c, ts in sast_ridge.kernels_generators_.items(): + plot_most_important_feature_sast_on_ts(ts.squeeze(), c, sast_ridge.kernel_orig_, sast_ridge.classifier.coef_[0], limit=3, offset=0) # plot only the first model one-vs-all model's features + """ From deeddbf8e9e8b6b8bf95d283428c46f603ae3d91 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Fri, 29 Mar 2024 13:48:02 +0100 Subject: [PATCH 02/38] updated transformer and classifier --- .../classification/shapelet_based/__init__.py | 2 + .../shapelet_based/_rsast_classifier.py | 22 +- .../collection/shapelet_based/__init__.py | 3 +- .../collection/shapelet_based/_rsast.py | 428 +++--------------- 4 files changed, 81 insertions(+), 374 deletions(-) diff --git a/aeon/classification/shapelet_based/__init__.py b/aeon/classification/shapelet_based/__init__.py index 6f5edda5ec..f810ae0259 100644 --- a/aeon/classification/shapelet_based/__init__.py +++ b/aeon/classification/shapelet_based/__init__.py @@ -5,9 +5,11 @@ "ShapeletTransformClassifier", "RDSTClassifier", "SASTClassifier", + "RSASTClassifier", ] from aeon.classification.shapelet_based._mrsqm import MrSQMClassifier from aeon.classification.shapelet_based._rdst import RDSTClassifier from aeon.classification.shapelet_based._sast_classifier import SASTClassifier +from aeon.classification.shapelet_based._rsast_classifier import RSASTClassifier from aeon.classification.shapelet_based._stc import ShapeletTransformClassifier diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index f71e03328e..4d7b800a44 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ 
b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -37,19 +37,18 @@ class RSASTClassifier(BaseClassifier): Reference --------- - .. [1] ... - "..." - ... - + .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling Shapelets for Time Series Classification. + https://hal.science/hal-04311309/ + Examples -------- - >>> from aeon.classification.shapelet_based import SASTClassifier + >>> from aeon.classification.shapelet_based import RSASTClassifier >>> from aeon.datasets import load_unit_test >>> X_train, y_train = load_unit_test(split="train") >>> X_test, y_test = load_unit_test(split="test") - >>> clf = SASTClassifier() + >>> clf = RSASTClassifier() >>> clf.fit(X_train, y_train) - SASTClassifier(...) + RSASTClassifier(...) >>> y_pred = clf.predict(X_test) """ @@ -69,12 +68,11 @@ def __init__( n_jobs=-1, ): super().__init__() - self.n_random_points=n_random_points, - self.len_method=len_method, + self.n_random_points = n_random_points, + self.len_method = len_method, self.nb_inst_per_class = nb_inst_per_class self.n_jobs = n_jobs self.seed = seed - self.classifier = classifier def _fit(self, X, y): @@ -94,8 +92,8 @@ def _fit(self, X, y): """ self._transformer = RSAST( - self.n_random_points=n_random_points, - self.len_method=len_method, + self.n_random_points, + self.len_method, self.nb_inst_per_class, self.seed, self.n_jobs, diff --git a/aeon/transformations/collection/shapelet_based/__init__.py b/aeon/transformations/collection/shapelet_based/__init__.py index 11ba7222a0..5b134c2c56 100644 --- a/aeon/transformations/collection/shapelet_based/__init__.py +++ b/aeon/transformations/collection/shapelet_based/__init__.py @@ -1,11 +1,12 @@ """Shapelet based transformers.""" -__all__ = ["RandomShapeletTransform", "RandomDilatedShapeletTransform", "SAST"] +__all__ = ["RandomShapeletTransform", "RandomDilatedShapeletTransform", "SAST", "RSAST" ] from aeon.transformations.collection.shapelet_based._dilated_shapelet_transform import ( 
RandomDilatedShapeletTransform, ) from aeon.transformations.collection.shapelet_based._sast import SAST +from aeon.transformations.collection.shapelet_based._rsast import RSAST from aeon.transformations.collection.shapelet_based._shapelet_transform import ( RandomShapeletTransform, ) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 8a7a26782e..cb4c247003 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -1,47 +1,9 @@ -# -*- coding: utf-8 -*- -""" -Spyder Editor - -This is a temporary script file. -""" - import numpy as np +from numba import get_num_threads, njit, prange, set_num_threads -from sklearn.base import BaseEstimator, ClassifierMixin, clone -from sklearn.utils.validation import check_array, check_X_y, check_is_fitted - -from sklearn.ensemble import RandomForestClassifier, VotingClassifier - -from sklearn.linear_model import RidgeClassifierCV, LogisticRegressionCV, LogisticRegression, RidgeClassifier - - -from sklearn.linear_model._base import LinearClassifierMixin -from sklearn.pipeline import Pipeline - -#from sktime.utils.data_processing import from_2d_array_to_nested -#from sktime.transformations.panel.rocket import Rocket - -from numba import njit, prange - -#from mass_ts import * - -import pandas as pd - -from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning -from statsmodels.tsa.stattools import acf, pacf - -import time - -import os -from operator import itemgetter - - - -from utils_sast import from_2d_array_to_nested, znormalize_array, load_dataset, format_dataset, plot_most_important_features, plot_most_important_feature_on_ts, plot_most_important_feature_sast_on_ts -from aeon.classification.shapelet_based import RDSTClassifier -#from sktime.datasets import load_UCR_UEA_dataset - - +from aeon.transformations.collection import BaseCollectionTransformer 
+from aeon.utils.numba.general import z_normalise_series +from aeon.utils.validation import check_n_jobs @@ -56,7 +18,7 @@ def apply_kernel(ts, arr): l = kernel.shape[0] for i in range(m - l + 1): - d = np.sum((znormalize_array(ts[i:i+l]) - kernel)**2) + d = np.sum((z_normalise_series(ts[i:i+l]) - kernel)**2) if d < d_best: d_best = d @@ -75,163 +37,73 @@ def apply_kernels(X, kernels): return out -class SAST(BaseEstimator, ClassifierMixin): - - def __init__(self, cand_length_list, shp_step=1, nb_inst_per_class=1, random_state=None, classifier=None): - super(SAST, self).__init__() - self.cand_length_list = cand_length_list - self.shp_step = shp_step - self.nb_inst_per_class = nb_inst_per_class - self.kernels_ = None - self.kernel_orig_ = None # not z-normalized kernels - self.kernels_generators_ = {} - self.random_state = np.random.RandomState(random_state) if not isinstance( - random_state, np.random.RandomState) else random_state - - self.classifier = classifier - - def get_params(self, deep=True): - return { - 'cand_length_list': self.cand_length_list, - 'shp_step': self.shp_step, - 'nb_inst_per_class': self.nb_inst_per_class, - 'classifier': self.classifier - } - - def init_sast(self, X, y): - - self.cand_length_list = np.array(sorted(self.cand_length_list)) - - assert self.cand_length_list.ndim == 1, 'Invalid shapelet length list: required list or tuple, or a 1d numpy array' - - if self.classifier is None: - self.classifier = RandomForestClassifier( - min_impurity_decrease=0.05, max_features=None) - - classes = np.unique(y) - self.num_classes = classes.shape[0] - - candidates_ts = [] - for c in classes: - X_c = X[y == c] - - # convert to int because if self.nb_inst_per_class is float, the result of np.min() will be float - cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) - choosen = self.random_state.permutation(X_c.shape[0])[:cnt] - candidates_ts.append(X_c[choosen]) - self.kernels_generators_[c] = X_c[choosen] - - candidates_ts = 
np.concatenate(candidates_ts, axis=0) - - self.cand_length_list = self.cand_length_list[self.cand_length_list <= X.shape[1]] - - max_shp_length = max(self.cand_length_list) - - n, m = candidates_ts.shape - - n_kernels = n * np.sum([m - l + 1 for l in self.cand_length_list]) - - self.kernels_ = np.full( - (n_kernels, max_shp_length), dtype=np.float32, fill_value=np.nan) - self.kernel_orig_ = [] - - k = 0 - - for shp_length in self.cand_length_list: - for i in range(candidates_ts.shape[0]): - for j in range(0, candidates_ts.shape[1] - shp_length + 1, self.shp_step): - end = j + shp_length - can = np.squeeze(candidates_ts[i][j: end]) - self.kernel_orig_.append(can) - self.kernels_[k, :shp_length] = znormalize_array(can) - - k += 1 - - def fit(self, X, y): - - X, y = check_X_y(X, y) # check the shape of the data - - # randomly choose reference time series and generate kernels - self.init_sast(X, y) - - # subsequence transform of X - X_transformed = apply_kernels(X, self.kernels_) - - self.classifier.fit(X_transformed, y) # fit the classifier +class RSAST(BaseCollectionTransformer): + """Random Scalable and Accurate Subsequence Transform (SAST). 
- return self - - def predict(self, X): - - check_is_fitted(self) # make sure the classifier is fitted - - X = check_array(X) # validate the shape of X - - # subsequence transform of X - X_transformed = apply_kernels(X, self.kernels_) - - return self.classifier.predict(X_transformed) - - def predict_proba(self, X): - check_is_fitted(self) # make sure the classifier is fitted - - X = check_array(X) # validate the shape of X - - # subsequence transform of X - X_transformed = apply_kernels(X, self.kernels_) - - if isinstance(self.classifier, LinearClassifierMixin): - return self.classifier._predict_proba_lr(X_transformed) - return self.classifier.predict_proba(X_transformed) - - -class SASTEnsemble(BaseEstimator, ClassifierMixin): + RSAST [1] is based on SAST, it uses a stratified sampling strategy for subsequences selection but additionally takes into account certain + statistical criteria such as ANOVA, ACF, and PACF to further reduce the search space of shapelets. + + RSAST starts with the pre-computation of a list of weights, using ANOVA, which helps in the selection of initial points for + subsequences. Then randomly select k time series per class, which are used with an ACF and PACF, obtaining a set of highly correlated + lagged values. These values are used as potential lengths for the shapelets. Lastly, with a pre-defined number of admissible starting + points to sample, the shapelets are extracted and used to transform the original dataset, replacing each time series by the vector of + its distance to each subsequence. + + Parameters + ---------- + n_random_points: int default = 10 the number of initial random points to extract + len_method: string default="both" the type of statistical tool used to get the length of shapelets. 
"both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, "None"=Extract randomly any length from the TS + nb_inst_per_class : int default = 10 + the number of reference time series to select per class + seed : int, default = None + the seed of the random generator + classifier : sklearn compatible classifier, default = None + if None, a RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)) is used. + n_jobs : int, default -1 + Number of threads to use for the transform. + + Reference + --------- + .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling Shapelets for Time Series Classification. + https://hal.science/hal-04311309/ + + + Examples + -------- + >>> from aeon.transformations.collection.shapelet_based import RSAST + >>> from aeon.datasets import load_unit_test + >>> X_train, y_train = load_unit_test(split="train") + >>> X_test, y_test = load_unit_test(split="test") + >>> rsast = RSAST() + >>> rsast.fit(X_train, y_train) + RSAST() + >>> X_train = rsast.transform(X_train) + >>> X_test = rsast.transform(X_test) - def __init__(self, cand_length_list, shp_step=1, nb_inst_per_class=1, random_state=None, classifier=None, weights=None, n_jobs=None): - super(SASTEnsemble, self).__init__() - self.cand_length_list = cand_length_list - self.shp_step = shp_step + """ + + _tags = { + "output_data_type": "Tabular", + "capability:multivariate": False, + "algorithm_type": "subsequence", + } + + def __init__( + self, + n_random_points=10, + len_method="both", + nb_inst_per_class=10, + seed=None, + n_jobs=-1, + ): + super().__init__() + self.n_random_points = n_random_points, + self.len_method = len_method, self.nb_inst_per_class = nb_inst_per_class - self.classifier = classifier - self.random_state = random_state self.n_jobs = n_jobs + self.seed = seed - self.saste = None - - self.weights = weights - - assert isinstance(self.classifier, BaseEstimator) - - self.init_ensemble() - - def init_ensemble(self): - estimators = [] - for i, candidate_lengths in 
enumerate(self.cand_length_list): - clf = clone(self.classifier) - sast = SAST(cand_length_list=candidate_lengths, - nb_inst_per_class=self.nb_inst_per_class, - random_state=self.random_state, - shp_step=self.shp_step, - classifier=clf) - estimators.append((f'sast{i}', sast)) - - self.saste = VotingClassifier( - estimators=estimators, voting='soft', n_jobs=self.n_jobs, weights=self.weights) - - def fit(self, X, y): - self.saste.fit(X, y) - return self - - def predict(self, X): - return self.saste.predict(X) - - def predict_proba(self, X): - return self.saste.predict_proba(X) - - - -class RSAST(BaseEstimator, ClassifierMixin): - + def __init__(self,n_random_points=10, nb_inst_per_class=10, len_method="both", random_state=None, classifier=None, sel_inst_wrepl=False,sel_randp_wrepl=False, half_instance=False, half_len=False,n_shapelet_samples=None ): super(RSAST, self).__init__() self.n_random_points = n_random_points @@ -517,169 +389,3 @@ def predict_proba(self, X): return self.classifier._predict_proba_lr(X_transformed) return self.classifier.predict_proba(X_transformed) - -if __name__ == "__main__": - - ds='Chinatown' # Chosing a dataset from # Number of classes to consider - - rtype="numpy2D" - - #X_train, y_train = load_UCR_UEA_dataset(name=ds, split="train",extract_path="data", return_type=rtype) - - - #X_train=np.nan_to_num(X_train) - #y_train=np.nan_to_num(y_train) - - #X_test, y_test = load_UCR_UEA_dataset(name=ds, split="test", extract_path="data", return_type=rtype) - - #X_test=np.nan_to_num(X_test) - #y_test=np.nan_to_num(y_test) - #print('Format: load_UCR_UEA_dataset') - #print(X_train.shape) - #print(X_test.shape) - #print(y_train.shape) - #print(y_test.shape) - - - #y_train = list(map(int, y_train)) - #y_test =list(map(int, y_test)) - #print(X_train[0]) - - """ - print("ds:"+ds) - X_train_mod=[] - for i , element in enumerate(X_train): - element=np.array(element[0]) - print("TS N:"+str(i)+" len:"+str(element.shape)) - #print(element) - 
X_train_mod.append(element) - - X_train_mod= np.array(X_train_mod) - print(X_train_mod.shape) - - X_train_mod=np.nan_to_num(X_train_mod) - """ - - path=r"C:\Users\Surface pro\random_sast\sast\data" - ds_train_lds , ds_test_lds = load_dataset(ds_folder=path,ds_name=ds,shuffle=False) - X_test_lds, y_test_lds = format_dataset(ds_test_lds) - X_train_lds, y_train_lds = format_dataset(ds_train_lds) - - X_train_lds=np.nan_to_num(X_train_lds) - y_train_lds=np.nan_to_num(y_train_lds) - X_test_lds=np.nan_to_num(X_test_lds) - y_test_lds=np.nan_to_num(y_test_lds) - - print('Format: load_dataset') - print(X_train_lds.shape) - print(X_train_lds[0].shape) - print(X_train_lds[1].shape) - print(X_test_lds.shape) - - - print(y_train_lds.shape) - print(y_test_lds.shape) - - - - - start = time.time() - random_state = None - rsast_ridge = RSAST(n_random_points=10, nb_inst_per_class=10, len_method="both") - rsast_ridge.fit(X_train_lds, y_train_lds) - end = time.time() - print('rsast score :', rsast_ridge.score(X_test_lds, y_test_lds)) - print('duration:', end-start) - print('params:', rsast_ridge.get_params()) - - #print('classifier:',rsast_ridge.classifier.coef_[0]) - - #fname = f'images/chinatown-rf-class{c}-top5-features-on-ref-ts.jpg' - #print(f"ts.shape{pd.array(rsast_ridge.kernels_generators_).shape}") - #print(f"kernel_d.shape{pd.array(rsast_ridge.kernel_orig_).shape}") - - plot_most_important_feature_on_ts(set_ts=rsast_ridge.kernels_generators_, labels=rsast_ridge.class_generators_, features=rsast_ridge.kernel_orig_, scores=rsast_ridge.classifier.coef_[0], limit=3, offset=0,znormalized=False) - - plot_most_important_features(rsast_ridge.kernel_orig_, rsast_ridge.classifier.coef_[0], limit=3,scale_color=False) - - X_train = X_train_lds[:, np.newaxis, :] - X_test = X_test_lds[:, np.newaxis, :] - y_train=np.asarray([int(x_s) for x_s in y_train_lds]) - y_test=np.asarray([int(x_s) for x_s in y_test_lds]) - start = time.time() - - rdst = RDSTClassifier( - max_shapelets=4, - 
shapelet_lengths=[7], - proba_normalization=0, - save_transformed_data=True - ) - rdst = RDSTClassifier(proba_normalization=0) - rdst.fit(X_train, y_train) - end = time.time() - - - - print('rdst score :', rdst.score(X_test, y_test)) - print('duration:', end-start) - print('params:', rdst.get_params()) - """ - for i, shp in enumerate(rdst._transformer.shapelets_[0].squeeze()): - print('rdst shapelet values:',str(i+1)," shape:", shp.shape," shapelet:", shp ) - - for i, dilation in enumerate(rdst._transformer.shapelets_[2].squeeze()): - print('rdst dilation parameter:',str(i+1)," shape:", shp.shape," dilation:", dilation ) - - for i, treshold in enumerate(rdst._transformer.shapelets_[3].squeeze()): - print('rdst treshold parameter:',str(i+1)," shape:", shp.shape," treshold:", treshold ) - - for i, normalization in enumerate(rdst._transformer.shapelets_[4].squeeze()): - print('rdst normalization parameter:',str(i+1)," shape:", shp.shape," normalization:", normalization ) - - for i, coef in enumerate(rdst._estimator["ridgeclassifiercv"].coef_): - print('rdst coef:',str(i+1)," shape:", coef.shape," coef:", coef ) - """ - - features_cl=rdst._transformer.shapelets_[0].squeeze() - dilations_cl=rdst._transformer.shapelets_[2].squeeze() - - coef_cl=rdst._estimator["ridgeclassifiercv"].coef_[0] - features_cl=[a for a in features_cl for i in range(3)] - dilations_cl=[a for a in dilations_cl for i in range(3)] - type_features_cl=["min","argmin","SO"]*len(features_cl) - - for l in pd.unique(rsast_ridge.class_generators_): - - all=zip(rsast_ridge.kernels_generators_,rsast_ridge.class_generators_) - - ts_cl=list(filter(lambda x: x[1]==l,all))[0][0] - ts_cl=[ts_cl for i in range(len(features_cl))] - labels=[l for i in range(len(features_cl))] - plot_most_important_feature_on_ts(set_ts=ts_cl, labels=labels, features=features_cl, scores=coef_cl,dilations=dilations_cl,type_features=type_features_cl, limit=3, offset=0,znormalized=False) - plot_most_important_features(features_cl, 
coef_cl, dilations=dilations_cl, limit=3, scale_color=False) - """ - min_shp_length = 3 - max_shp_length = X_train_lds.shape[1] - candidate_lengths = np.arange(min_shp_length, max_shp_length+1) - # candidate_lengths = (3, 7, 9, 11) - nb_inst_per_class = 1 - ridge = RidgeClassifierCV(alphas = np.logspace(-3, 3, 10)) - - start = time.time() - random_state = None - sast_ridge = SAST(cand_length_list=candidate_lengths, - nb_inst_per_class=nb_inst_per_class, - random_state=random_state, classifier=ridge) - sast_ridge.fit(X_train_lds, y_train_lds) - end = time.time() - print('sast score :', sast_ridge.score(X_test_lds, y_test_lds)) - print('duration:', end-start) - print('params:', sast_ridge.get_params()) - #print('classifier:',rsast_ridge.classifier.coef_[0]) - - #fname = f'images/chinatown-rf-class{c}-top5-features-on-ref-ts.jpg' - #print(f"ts.shape{pd.array(rsast_ridge.kernels_generators_).shape}") - #print(f"kernel_d.shape{pd.array(rsast_ridge.kernel_orig_).shape}") - for c, ts in sast_ridge.kernels_generators_.items(): - plot_most_important_feature_sast_on_ts(ts.squeeze(), c, sast_ridge.kernel_orig_, sast_ridge.classifier.coef_[0], limit=3, offset=0) # plot only the first model one-vs-all model's features - """ From 2c0a41dc96f4ac04ffcdb4db239b8189242f0262 Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Mon, 1 Apr 2024 17:51:50 +0200 Subject: [PATCH 03/38] included transformer and classifier --- .../collection/shapelet_based/_rsast.py | 215 +++---- .../shapelet_based (RSAST).ipynb | 523 ++++++++++++++++++ 2 files changed, 597 insertions(+), 141 deletions(-) create mode 100644 examples/classification/shapelet_based (RSAST).ipynb diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index cb4c247003..8ab7ad23f4 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -5,20 +5,20 @@ from 
aeon.utils.numba.general import z_normalise_series from aeon.utils.validation import check_n_jobs +from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning +from statsmodels.tsa.stattools import acf, pacf +import pandas as pd @njit(fastmath=False) -def apply_kernel(ts, arr): +def _apply_kernel(ts, arr): d_best = np.inf # sdist m = ts.shape[0] kernel = arr[~np.isnan(arr)] # ignore nan - # profile = mass2(ts, kernel) - # d_best = np.min(profile) - - l = kernel.shape[0] - for i in range(m - l + 1): - d = np.sum((z_normalise_series(ts[i:i+l]) - kernel)**2) + kernel_len = kernel.shape[0] + for i in range(m - kernel_len + 1): + d = np.sum((z_normalise_series(ts[i : i + kernel_len]) - kernel) ** 2) if d < d_best: d_best = d @@ -26,14 +26,14 @@ def apply_kernel(ts, arr): @njit(parallel=True, fastmath=True) -def apply_kernels(X, kernels): +def _apply_kernels(X, kernels): nbk = len(kernels) out = np.zeros((X.shape[0], nbk), dtype=np.float32) for i in prange(nbk): k = kernels[i] for t in range(X.shape[0]): ts = X[t] - out[t][i] = apply_kernel(ts, k) + out[t][i] = _apply_kernel(ts, k) return out @@ -90,11 +90,11 @@ class RSAST(BaseCollectionTransformer): def __init__( self, - n_random_points=10, - len_method="both", - nb_inst_per_class=10, - seed=None, - n_jobs=-1, + n_random_points = 10, + len_method = "both", + nb_inst_per_class = 10, + seed = None, + n_jobs = -1, ): super().__init__() self.n_random_points = n_random_points, @@ -102,48 +102,29 @@ def __init__( self.nb_inst_per_class = nb_inst_per_class self.n_jobs = n_jobs self.seed = seed - - - def __init__(self,n_random_points=10, nb_inst_per_class=10, len_method="both", random_state=None, classifier=None, sel_inst_wrepl=False,sel_randp_wrepl=False, half_instance=False, half_len=False,n_shapelet_samples=None ): - super(RSAST, self).__init__() - self.n_random_points = n_random_points - self.nb_inst_per_class = nb_inst_per_class - self.len_method = len_method - self.random_state = 
np.random.RandomState(random_state) if not isinstance( - random_state, np.random.RandomState) else random_state - self.classifier = classifier - self.cand_length_list = None - self.kernels_ = None - self.kernel_orig_ = None # not z-normalized kernels - self.kernel_permutated_ = None - self.kernels_generators_ = None - self.class_generators_ = None - self.sel_inst_wrepl=sel_inst_wrepl - self.sel_randp_wrepl=sel_randp_wrepl - self.half_instance=half_instance - self.half_len=half_len - self.time_calculating_weights = None - self.time_creating_subsequences = None - self.time_transform_dataset = None - self.time_classifier = None - self.n_shapelet_samples =n_shapelet_samples - - def get_params(self, deep=True): - return { - 'len_method': self.len_method, - 'n_random_points': self.n_random_points, - 'nb_inst_per_class': self.nb_inst_per_class, - 'sel_inst_wrepl':self.sel_inst_wrepl, - 'sel_randp_wrepl':self.sel_randp_wrepl, - 'half_instance':self.half_instance, - 'half_len':self.half_len, - 'classifier': self.classifier, - 'cand_length_list': self.cand_length_list - } - - def init_sast(self, X, y): - #0- initialize variables and convert values in "y" to string - start = time.time() + self._kernels = None # z-normalized subsequences + self._kernel_orig = None # non z-normalized subsequences + self._kernels_generators = {} # Reference time series + self._cand_length_list = None + + def _fit(self, X, y): + """Select reference time series and generate subsequences from them. + + Parameters + ---------- + X: np.ndarray shape (n_cases, n_channels, n_timepoints) + The training input samples. + y: array-like or list + The class values for X. 
+ + Return + ------ + self : RSAST + This transformer + + """ + #0- initialize variables and convert values in "y" to string + y=np.asarray([str(x_s) for x_s in y]) self.cand_length_list = {} @@ -189,27 +170,19 @@ def init_sast(self, X, y): n.append(0) else: n.append(1-p_value) - end = time.time() - self.time_calculating_weights = end-start + + #2--calculate PACF and ACF for each TS chossen in each class - start = time.time() + for i, c in enumerate(classes): X_c = X[y == c] - if self.half_instance==True: - cnt = np.max([X_c.shape[0]//2, 1]).astype(int) - self.nb_inst_per_class=cnt - else: - cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) + + cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) #set if the selection of instances is with replacement (if false it is not posible to select the same intance more than one) - if self.sel_inst_wrepl ==False: - choosen = self.random_state.permutation(X_c.shape[0])[:cnt] - else: - choosen = self.random_state.choice(X_c.shape[0], cnt) - - - + + choosen = self.random_state.permutation(X_c.shape[0])[:cnt] for rep, idx in enumerate(choosen): self.cand_length_list[c+","+str(idx)+","+str(rep)] = [] @@ -281,20 +254,18 @@ def init_sast(self, X, y): weights = n / np.sum(n) weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum(weights[:len(X_c[idx])-max_shp_length+1]) - if self.half_len==True: - self.n_random_points=np.max([len(X_c[idx])//2, 1]).astype(int) - if self.n_random_points > len(X_c[idx])-max_shp_length+1 and self.sel_randp_wrepl==False: + if self.n_random_points > len(X_c[idx])-max_shp_length+1 : #set a upper limit for the posible of number of random points when selecting without replacement limit_rpoint=len(X_c[idx])-max_shp_length+1 - rand_point_ts = self.random_state.choice(len(X_c[idx])-max_shp_length+1, limit_rpoint, p=weights, replace=self.sel_randp_wrepl) + rand_point_ts = self.random_state.choice(len(X_c[idx])-max_shp_length+1, limit_rpoint, p=weights, replace=False) 
#print("limit_rpoint:"+str(limit_rpoint)) else: - rand_point_ts = self.random_state.choice(len(X_c[idx])-max_shp_length+1, self.n_random_points, p=weights, replace=self.sel_randp_wrepl) - #print("n_random_points:"+str(self.n_random_points)) + rand_point_ts = self.random_state.choice(len(X_c[idx])-max_shp_length+1, self.n_random_points, p=weights, replace=False) + + - #print("rpoints:"+str(rand_point_ts)) for i in rand_point_ts: #2.6-- Extract the subsequence with that point @@ -309,83 +280,45 @@ def init_sast(self, X, y): print("total kernels:"+str(len(self.kernel_orig_))) - if self.n_shapelet_samples!=None: - print("Truncated to:"+str(self.n_shapelet_samples)) - - self.kernel_permutated_ = self.random_state.permutation(self.kernel_orig_)[:self.n_shapelet_samples] - else: - self.kernel_permutated_ = self.kernel_orig_ + #3--save the calculated subsequences - n_kernels = len (self.kernel_permutated_) + n_kernels = len (self.kernel_orig_) self.kernels_ = np.full( (n_kernels, m_kernel), dtype=np.float32, fill_value=np.nan) - for k, kernel in enumerate(self.kernel_permutated_): - self.kernels_[k, :len(kernel)] = znormalize_array(kernel) - - end = time.time() - self.time_creating_subsequences = end-start - - def fit(self, X, y): - - X, y = check_X_y(X, y) # check the shape of the data - - # randomly choose reference time series and generate kernels - self.init_sast(X, y) - - start = time.time() - # subsequence transform of X - X_transformed = apply_kernels(X, self.kernels_) - end = time.time() - self.transform_dataset = end-start - - if self.classifier is None: - - if X_transformed.shape[0]<=X_transformed.shape[1]: #n_features (kernels) > n_samples (intances) - self.classifier=RidgeClassifierCV() - print("RidgeClassifierCV:"+str("size training")+str(X_transformed.shape[0])+"<="+" kernels"+str(X_transformed.shape[1])) - else: - print("LogisticRegression:"+str("size training")+str(X_transformed.shape[0])+">"+" kernels"+str(X_transformed.shape[1])) - 
self.classifier=LogisticRegression() - #self.classifier = RandomForestClassifier(min_impurity_decrease=0.05, max_features=None) - - start = time.time() - #print('X_transformed shape') - #print(X_transformed.shape) - #print('X_transformed') - #print(X_transformed) - - self.classifier.fit(X_transformed, y) # fit the classifier - end = time.time() - self.time_classifier = end-start + for k, kernel in enumerate(self.kernel_orig_): + self.kernels_[k, :len(kernel)] = z_normalise_series(kernel) return self + + def _transform(self, X, y=None): + """Transform the input X using the generated subsequences. - def predict(self, X): - - check_is_fitted(self) # make sure the classifier is fitted - - X = check_array(X) # validate the shape of X - - # subsequence transform of X - X_transformed = apply_kernels(X, self.kernels_) - - return self.classifier.predict(X_transformed) + Parameters + ---------- + X: np.ndarray shape (n_cases, n_channels, n_timepoints) + The training input samples. + y: array-like or list + Ignored argument, interface compatibility - def predict_proba(self, X): - check_is_fitted(self) # make sure the classifier is fitted + Return + ------ + X_transformed: np.ndarray shape (n_cases, n_timepoints), + The transformed data + """ + X_ = np.reshape(X, (X.shape[0], X.shape[-1])) - X = check_array(X) # validate the shape of X + prev_threads = get_num_threads() - # subsequence transform of X - X_transformed = apply_kernels(X, self.kernels_) + n_jobs = check_n_jobs(self.n_jobs) - if isinstance(self.classifier, LinearClassifierMixin): - return self.classifier._predict_proba_lr(X_transformed) - return self.classifier.predict_proba(X_transformed) + set_num_threads(n_jobs) + X_transformed = _apply_kernels(X_, self._kernels) # subsequence transform of X + set_num_threads(prev_threads) + return X_transformed diff --git a/examples/classification/shapelet_based (RSAST).ipynb b/examples/classification/shapelet_based (RSAST).ipynb new file mode 100644 index 
0000000000..afcdedc67b --- /dev/null +++ b/examples/classification/shapelet_based (RSAST).ipynb @@ -0,0 +1,523 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "# Shapelet based time series machine learning\n", + "\n", + "Shapelets are subsections of time series taken from the train data that are useful for time series machine learning. They were first proposed as a primitive for machine learning [1][2] and were embedded in a decision tree for classification. The Shapelet Transform Classifier (STC)[3,4] is a pipeline classifier which searches the training data for shapelets, transforms series to vectors of distances to a filtered set of selected shapelets based on information gain, then builds a classifier on the latter.\n", + "\n", + "Finding shapelets involves selecting and evaluating shapelets. The original shapelet tree and STC performed a full enumeration of all possible shapelets before keeping the best ones. This is computationally inefficient, and modern shapelet based machine learning algorithms randomise the search." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(r'C:\\Users\\nicol\\aeon')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('MrSQMClassifier',\n", + " aeon.classification.shapelet_based._mrsqm.MrSQMClassifier),\n", + " ('RDSTClassifier', aeon.classification.shapelet_based._rdst.RDSTClassifier),\n", + " ('ShapeletTransformClassifier',\n", + " aeon.classification.shapelet_based._stc.ShapeletTransformClassifier)]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import warnings\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "from aeon.datasets import load_basic_motions\n", + "from aeon.registry import all_estimators\n", + "from aeon.transformations.collection.shapelet_based import RandomShapeletTransform\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "all_estimators(\"classifier\", filter_tags={\"algorithm_type\": \"shapelet\"})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "### Shapelet Transform for Classification\n", + "\n", + "The `RandomShapeletTransform` transformer takes a set of labelled training time series in the `fit` function, randomly samples `n_shapelet_samples` shapelets, keeping the best `max_shapelets`. The resulting shapelets are used in the `transform` function to create a new tabular dataset, where each row represents a time series instance, and each column stores the distance from a time series to a shapelet. The resulting tabular data can be used by any scikit learn compatible classifier. In this notebook we will explain these terms and describe how the algorithm works. But first we show it in action. 
We will use the BasicMotions data as an example. This data set contains time series of motion traces for the activities \"running\", \"walking\", \"standing\" and \"badminton\". The learning problem is to predict the activity given the time series. Each time series has six channels: x, y, z position and x, y, z accelerometer of the wrist. Data was recorded on a smart watch." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Shape of transformed data = (40, 8)\n", + " Distance of second series to third shapelet = 1.302772121165026\n", + " Shapelets + random forest acc = 0.95\n" + ] + } + ], + "source": [ + "X, y = load_basic_motions(split=\"train\")\n", + "rst = RandomShapeletTransform(n_shapelet_samples=100, max_shapelets=10, random_state=42)\n", + "st = rst.fit_transform(X, y)\n", + "print(\" Shape of transformed data = \", st.shape)\n", + "print(\" Distance of second series to third shapelet = \", st[1][2])\n", + "testX, testy = load_basic_motions(split=\"test\")\n", + "tr_test = rst.transform(testX)\n", + "rf = RandomForestClassifier(random_state=10)\n", + "rf.fit(st, y)\n", + "preds = rf.predict(tr_test)\n", + "print(\" Shapelets + random forest acc = \", accuracy_score(preds, testy))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "### Visualising Shapelets\n", + "The first column of the transformed data represents the distance from the first shapelet to each time series. The shapelets are sorted, so the first shapelet is the one we estimate is the best (using the calculation described below). You can recover the shapelets from the transform. 
Each shapelet is a 7-tuple, storing the following information:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quality = 0.81127812\n", + "Length = 39\n", + "position = 55\n", + "Channel = 0\n", + "Origin Instance Index = 11\n", + "Class label = running\n", + "Shapelet = [-0.85667017 -1.88711152 -0.8751295 0.80633757 1.10838333 0.69810992\n", + " 0.85713394 1.23190921 0.01801365 -1.29683966 -1.94694259 -0.37487726\n", + " -0.37487726 1.39471462 0.74922685 0.74922685 0.22343376 0.22343376\n", + " -0.7730703 -1.37591995 -0.80376393 1.32758071 0.99778845 0.6013481\n", + " 0.83711118 0.93684593 0.93684593 -1.30429475 -1.64522057 -0.56312308\n", + " 0.96855713 0.56796251 0.35714242 0.62066541 0.65135287 -0.80531237\n", + " -1.49170075 -1.18512797 0.69685753]\n" + ] + } + ], + "source": [ + "running_shapelet = rst.shapelets[0]\n", + "print(\"Quality = \", running_shapelet[0])\n", + "print(\"Length = \", running_shapelet[1])\n", + "print(\"position = \", running_shapelet[2])\n", + "print(\"Channel = \", running_shapelet[3])\n", + "print(\"Origin Instance Index = \", running_shapelet[4])\n", + "print(\"Class label = \", running_shapelet[5])\n", + "print(\"Shapelet = \", running_shapelet[6])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "We can directly extract shapelets and inspect them. These are the two shapelets that are best at discriminating badminton and running against other activities. All shapelets are normalised to provide scale invariance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Badminton shapelet from channel 0 (x-dimension) (0.65194393, 74, 7, 1, 1, 'standing', array([-5.27667376, -0.94911454, 0.90433173, 1.26316864, 2.34760078,\n", + " 1.84408 , 0.9192852 , 0.9192852 , -1.29868372, -1.29868372,\n", + " -1.5476774 , -1.03000413, 0.27593674, -0.70184658, 0.37460295,\n", + " 1.27398121, 1.02881837, 0.64543662, -0.0669839 , -0.54373096,\n", + " -0.55716134, -0.56605101, -0.08611633, 0.31270572, 0.25642625,\n", + " 0.5512744 , 0.78929504, 0.73385326, 0.73385326, -0.26777726,\n", + " -0.63967737, -0.63967737, -0.5539071 , -0.5539071 , 0.3867047 ,\n", + " 0.3867047 , 0.88832979, 0.85074214, 0.46901267, 0.0925433 ,\n", + " -0.34444436, -0.72498936, -0.83763127, -0.53034818, -0.05869122,\n", + " 0.46600593, 1.02537238, 0.81800526, 0.51709059, 0.17497366,\n", + " -0.31072836, -0.64876695, -0.89102368, -0.60834799, -0.0627886 ,\n", + " 0.42532723, 0.95696668, 0.91077086, 0.77491818, 0.14283377,\n", + " 0.14283377, -1.08722874, -1.08722874, -0.65706914, -0.65706914,\n", + " 0.28210933, 0.74159654, 0.8064869 , 0.8064869 , 0.19889294,\n", + " -0.16601048, -0.78706337, -0.76364317, -0.63789726]))\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "badminton_shapelet = rst.shapelets[4]\n", + "print(\" Badminton shapelet from channel 0 (x-dimension)\", badminton_shapelet)\n", + "plt.title(\"Best shapelets for running and badminton\")\n", + "plt.plot(badminton_shapelet[6], label=\"Badminton\")\n", + "plt.plot(running_shapelet[6], label=\"Running\")\n", + "plt.legend()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "Both shapelets are in the x-axis, so represent side to side motion. Badminton is characterised by sa single large peak in one direction, capturing the drawing of the hand back and quickly hittig the shuttlcock. Running is chaaracterised by a longer repetition of side to side motions, with a sharper peak representing bringing the arm forward accross the body in a running motion." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## Performance on the UCR univariate datasets\n", + "\n", + "You can find the interval based classifiers as follows." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MrSQMClassifier\n", + "RDSTClassifier\n", + "ShapeletTransformClassifier\n" + ] + } + ], + "source": [ + "from aeon.registry import all_estimators\n", + "\n", + "est = [\"MrSQMClassifier\", \"RDSTClassifier\", \"ShapeletTransformClassifier\"]\n", + "for c in est:\n", + " print(c)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(112, 3)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from aeon.benchmarking import get_estimator_results_as_array\n", + "from aeon.datasets.tsc_data_lists import univariate\n", + "\n", + "names = [t.replace(\"Classifier\", \"\") for t in est]\n", + "results, present_names = get_estimator_results_as_array(\n", + " names, univariate, include_missing=False\n", + ")\n", + "results.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(
, )" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from aeon.visualisation import plot_boxplot_median, plot_critical_difference\n", + "\n", + "plot_critical_difference(results, names)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(
, )" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_boxplot_median(results, names)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scalable and Accurate Subsequence Transform (SAST)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "from aeon.datasets import load_basic_motions,load_classification\n", + "\n", + "from aeon.transformations.collection.shapelet_based import SAST\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Shape of transformed data = (20, 504)\n", + " Distance of second series to third shapelet = 0.0004470423\n", + " Shapelets + random forest acc = 1.0\n" + ] + } + ], + "source": [ + "#X, y = load_basic_motions(split=\"train\")\n", + "X, y = load_classification(name=\"Chinatown\",split=\"train\")\n", + "sast = SAST(lengths=None,\n", + " stride=1,\n", + " nb_inst_per_class=1,\n", + " seed=42,\n", + " n_jobs=-1)\n", + "st = sast.fit_transform(X, y)\n", + "print(\" Shape of transformed data = \", st.shape)\n", + "print(\" Distance of second series to third shapelet = \", st[1][2])\n", + "#testX, testy = load_basic_motions(split=\"test\")\n", + "testX, testy = load_classification(name=\"Chinatown\",split=\"train\")\n", + "tr_test = sast.transform(testX)\n", + "rf = RandomForestClassifier(random_state=10)\n", + "rf.fit(st, y)\n", + "preds = rf.predict(tr_test)\n", + "print(\" Shapelets + random forest acc = \", accuracy_score(preds, testy))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Random Scalable and Accurate Subsequence Transform (RSAST)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": 
[], + "source": [ + "import sys\n", + "sys.path.append(r'C:\\Users\\nicol\\aeon')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "from aeon.datasets import load_basic_motions,load_classification\n", + "\n", + "from aeon.transformations.collection.shapelet_based import RSAST\n", + "\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[3], line 8\u001b[0m\n\u001b[0;32m 2\u001b[0m X, y \u001b[38;5;241m=\u001b[39m load_classification(name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mChinatown\u001b[39m\u001b[38;5;124m\"\u001b[39m,split\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 3\u001b[0m sast \u001b[38;5;241m=\u001b[39m RSAST(n_random_points\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m,\n\u001b[0;32m 4\u001b[0m len_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mboth\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 5\u001b[0m nb_inst_per_class\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m,\n\u001b[0;32m 6\u001b[0m seed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 7\u001b[0m n_jobs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m----> 8\u001b[0m st 
\u001b[38;5;241m=\u001b[39m \u001b[43msast\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Shape of transformed data = \u001b[39m\u001b[38;5;124m\"\u001b[39m, st\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Distance of second series to third shapelet = \u001b[39m\u001b[38;5;124m\"\u001b[39m, st[\u001b[38;5;241m1\u001b[39m][\u001b[38;5;241m2\u001b[39m])\n", + "File \u001b[1;32m~\\aeon\\aeon\\transformations\\collection\\base.py:162\u001b[0m, in \u001b[0;36mBaseCollectionTransformer.fit_transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 159\u001b[0m X_inner \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_preprocess_collection(X)\n\u001b[0;32m 160\u001b[0m y_inner \u001b[38;5;241m=\u001b[39m y\n\u001b[1;32m--> 162\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX_inner\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my_inner\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 164\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_fitted \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m 166\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Xt\n", + "File \u001b[1;32m~\\aeon\\aeon\\transformations\\collection\\base.py:328\u001b[0m, in \u001b[0;36mBaseCollectionTransformer._fit_transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 309\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit to data, then 
transform it.\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \n\u001b[0;32m 311\u001b[0m \u001b[38;5;124;03mFits the transformer to X and y and returns a transformed version of X.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 324\u001b[0m \u001b[38;5;124;03mtransformed version of X.\u001b[39;00m\n\u001b[0;32m 325\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 326\u001b[0m \u001b[38;5;66;03m# Non-optimized default implementation; override when a better\u001b[39;00m\n\u001b[0;32m 327\u001b[0m \u001b[38;5;66;03m# method is possible for a given algorithm.\u001b[39;00m\n\u001b[1;32m--> 328\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 329\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_transform(X, y)\n", + "File \u001b[1;32m~\\aeon\\aeon\\transformations\\collection\\shapelet_based\\_rsast.py:169\u001b[0m, in \u001b[0;36mRSAST._fit\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 165\u001b[0m p_value\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39mnan\n\u001b[0;32m 166\u001b[0m \u001b[38;5;66;03m# Interpretation of the results\u001b[39;00m\n\u001b[0;32m 167\u001b[0m \u001b[38;5;66;03m# if p_value < 0.05: \" The means of the populations are significantly different.\"\u001b[39;00m\n\u001b[0;32m 168\u001b[0m \u001b[38;5;66;03m#print('pvalue', str(p_value))\u001b[39;00m\n\u001b[1;32m--> 169\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39misnan(p_value):\n\u001b[0;32m 170\u001b[0m n\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m 171\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[1;31mValueError\u001b[0m: The truth value of an array with more than one element is ambiguous. 
Use a.any() or a.all()" + ] + } + ], + "source": [ + "#X, y = load_basic_motions(split=\"train\")\n", + "X, y = load_classification(name=\"Chinatown\",split=\"train\")\n", + "sast = RSAST(n_random_points=10,\n", + " len_method=\"both\",\n", + " nb_inst_per_class=10,\n", + " seed=None,\n", + " n_jobs=-1)\n", + "st = sast.fit_transform(X, y)\n", + "print(\" Shape of transformed data = \", st.shape)\n", + "print(\" Distance of second series to third shapelet = \", st[1][2])\n", + "#testX, testy = load_basic_motions(split=\"test\")\n", + "testX, testy = load_classification(name=\"Chinatown\",split=\"train\")\n", + "tr_test = sast.transform(testX)\n", + "rf = RandomForestClassifier(random_state=10)\n", + "rf.fit(st, y)\n", + "preds = rf.predict(tr_test)\n", + "print(\" Shapelets + random forest acc = \", accuracy_score(preds, testy))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From ddd830a0da10382e6417461245a803bc2737f0ef Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Tue, 2 Apr 2024 00:16:31 +0200 Subject: [PATCH 04/38] updated rsast tranformer --- .../collection/shapelet_based/_rsast.py | 83 +++++++++++++++---- .../shapelet_based (RSAST).ipynb | 28 +++---- 2 files changed, 77 insertions(+), 34 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 8ab7ad23f4..3bca11fef6 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -1,3 +1,6 @@ +import sys +sys.path.append(r'C:\Users\nicol\aeon') + import numpy as np 
from numba import get_num_threads, njit, prange, set_num_threads @@ -96,9 +99,8 @@ def __init__( seed = None, n_jobs = -1, ): - super().__init__() - self.n_random_points = n_random_points, - self.len_method = len_method, + self.n_random_points = n_random_points + self.len_method = len_method self.nb_inst_per_class = nb_inst_per_class self.n_jobs = n_jobs self.seed = seed @@ -106,6 +108,9 @@ def __init__( self._kernel_orig = None # non z-normalized subsequences self._kernels_generators = {} # Reference time series self._cand_length_list = None + super().__init__() + + def _fit(self, X, y): """Select reference time series and generate subsequences from them. @@ -123,9 +128,21 @@ def _fit(self, X, y): This transformer """ - #0- initialize variables and convert values in "y" to string - y=np.asarray([str(x_s) for x_s in y]) + #0- initialize variables and convert values in "y" to string + X_ = np.reshape(X, (X.shape[0], X.shape[-1])) + + self._random_state = ( + np.random.RandomState(self.seed) + if not isinstance(self.seed, np.random.RandomState) + else self.seed + ) + + classes = np.unique(y) + self._num_classes = classes.shape[0] + + candidates_ts = [] + y = np.asarray([str(x_s) for x_s in y]) self.cand_length_list = {} self.kernel_orig_ = [] @@ -142,12 +159,12 @@ def _fit(self, X, y): m_kernel = 0 #1--calculate ANOVA per each time t throught the lenght of the TS - for i in range (X.shape[1]): + for i in range (X_.shape[1]): statistic_per_class= {} for c in classes: - assert len(X[np.where(y==c)[0]][:,i])> 0, 'Time t without values in TS' + assert len(X_[np.where(y==c)[0]][:,i])> 0, 'Time t without values in TS' - statistic_per_class[c]=X[np.where(y==c)[0]][:,i] + statistic_per_class[c]=X_[np.where(y==c)[0]][:,i] #print("statistic_per_class- i:"+str(i)+', c:'+str(c)) #print(statistic_per_class[c].shape) @@ -162,7 +179,9 @@ def _fit(self, X, y): try: t_statistic, p_value = f_oneway(*statistic_per_class) except DegenerateDataWarning or ConstantInputWarning: - 
p_value=np.nan + p_value = np.nan + + #print('statistic_per_class', str(statistic_per_class)) # Interpretation of the results # if p_value < 0.05: " The means of the populations are significantly different." #print('pvalue', str(p_value)) @@ -177,12 +196,12 @@ def _fit(self, X, y): #2--calculate PACF and ACF for each TS chossen in each class for i, c in enumerate(classes): - X_c = X[y == c] + X_c = X_[y == c] cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) #set if the selection of instances is with replacement (if false it is not posible to select the same intance more than one) - choosen = self.random_state.permutation(X_c.shape[0])[:cnt] + choosen = self._random_state.permutation(X_c.shape[0])[:cnt] for rep, idx in enumerate(choosen): self.cand_length_list[c+","+str(idx)+","+str(rep)] = [] @@ -226,7 +245,7 @@ def _fit(self, X, y): if len(self.cand_length_list[c+","+str(idx)+","+str(rep)])==0: #chose a random lenght using the lenght of the time series (added 1 since the range start in 0) - rand_value= self.random_state.choice(len(X_c[idx]), 1)[0]+1 + rand_value= self._random_state.choice(len(X_c[idx]), 1)[0]+1 self.cand_length_list[c+","+str(idx)+","+str(rep)].extend([max(3,rand_value)]) #elif len(non_zero_acf)==0: #print("There is no AC in TS", idx, " of class ",c) @@ -255,14 +274,14 @@ def _fit(self, X, y): weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum(weights[:len(X_c[idx])-max_shp_length+1]) - + if self.n_random_points > len(X_c[idx])-max_shp_length+1 : #set a upper limit for the posible of number of random points when selecting without replacement limit_rpoint=len(X_c[idx])-max_shp_length+1 - rand_point_ts = self.random_state.choice(len(X_c[idx])-max_shp_length+1, limit_rpoint, p=weights, replace=False) + rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, limit_rpoint, p=weights, replace=False) #print("limit_rpoint:"+str(limit_rpoint)) else: - rand_point_ts = 
self.random_state.choice(len(X_c[idx])-max_shp_length+1, self.n_random_points, p=weights, replace=False) + rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, self.n_random_points, p=weights, replace=False) @@ -271,7 +290,7 @@ def _fit(self, X, y): #2.6-- Extract the subsequence with that point kernel = X_c[idx][i:i+max_shp_length].reshape(1,-1) #print("kernel:"+str(kernel)) - if m_kernel 8\u001b[0m st \u001b[38;5;241m=\u001b[39m \u001b[43msast\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Shape of transformed data = \u001b[39m\u001b[38;5;124m\"\u001b[39m, st\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Distance of second series to third shapelet = \u001b[39m\u001b[38;5;124m\"\u001b[39m, st[\u001b[38;5;241m1\u001b[39m][\u001b[38;5;241m2\u001b[39m])\n", - "File \u001b[1;32m~\\aeon\\aeon\\transformations\\collection\\base.py:162\u001b[0m, in \u001b[0;36mBaseCollectionTransformer.fit_transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 159\u001b[0m X_inner \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_preprocess_collection(X)\n\u001b[0;32m 160\u001b[0m y_inner \u001b[38;5;241m=\u001b[39m y\n\u001b[1;32m--> 162\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX_inner\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my_inner\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 164\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_fitted \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m 166\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Xt\n", - "File \u001b[1;32m~\\aeon\\aeon\\transformations\\collection\\base.py:328\u001b[0m, in \u001b[0;36mBaseCollectionTransformer._fit_transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 309\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit to data, then transform it.\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \n\u001b[0;32m 311\u001b[0m \u001b[38;5;124;03mFits the transformer to X and y and returns a transformed version of X.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 324\u001b[0m \u001b[38;5;124;03mtransformed version of X.\u001b[39;00m\n\u001b[0;32m 325\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 326\u001b[0m \u001b[38;5;66;03m# Non-optimized default implementation; override when a better\u001b[39;00m\n\u001b[0;32m 327\u001b[0m \u001b[38;5;66;03m# method is possible for a given algorithm.\u001b[39;00m\n\u001b[1;32m--> 328\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 329\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_transform(X, y)\n", - "File \u001b[1;32m~\\aeon\\aeon\\transformations\\collection\\shapelet_based\\_rsast.py:169\u001b[0m, in \u001b[0;36mRSAST._fit\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 165\u001b[0m p_value\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39mnan\n\u001b[0;32m 166\u001b[0m \u001b[38;5;66;03m# Interpretation of the results\u001b[39;00m\n\u001b[0;32m 167\u001b[0m \u001b[38;5;66;03m# if p_value < 0.05: \" The means of the populations are significantly different.\"\u001b[39;00m\n\u001b[0;32m 168\u001b[0m 
\u001b[38;5;66;03m#print('pvalue', str(p_value))\u001b[39;00m\n\u001b[1;32m--> 169\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39misnan(p_value):\n\u001b[0;32m 170\u001b[0m n\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m 171\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "\u001b[1;31mValueError\u001b[0m: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()" + "name": "stdout", + "output_type": "stream", + "text": [ + "total kernels:570\n", + " Shape of transformed data = (20, 570)\n", + " Distance of second series to third shapelet = 0.16883065\n", + " Shapelets + random forest acc = 1.0\n" ] } ], "source": [ "#X, y = load_basic_motions(split=\"train\")\n", "X, y = load_classification(name=\"Chinatown\",split=\"train\")\n", - "sast = RSAST(n_random_points=10,\n", + "rsast = RSAST(n_random_points=10,\n", " len_method=\"both\",\n", " nb_inst_per_class=10,\n", " seed=None,\n", " n_jobs=-1)\n", - "st = sast.fit_transform(X, y)\n", + "st = rsast.fit_transform(X, y)\n", "print(\" Shape of transformed data = \", st.shape)\n", "print(\" Distance of second series to third shapelet = \", st[1][2])\n", "#testX, testy = load_basic_motions(split=\"test\")\n", "testX, testy = load_classification(name=\"Chinatown\",split=\"train\")\n", - "tr_test = sast.transform(testX)\n", + "tr_test = rsast.transform(testX)\n", "rf = RandomForestClassifier(random_state=10)\n", "rf.fit(st, y)\n", "preds = rf.predict(tr_test)\n", From 73d8eac21a436d0ae5c2ee502924bd4da57b2403 Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sun, 7 Apr 2024 13:40:49 +0200 Subject: [PATCH 05/38] deleted example --- .../shapelet_based/_rsast_classifier.py | 14 +- .../collection/shapelet_based/_rsast.py | 80 +-- .../shapelet_based (RSAST).ipynb | 519 ------------------ 3 files changed, 32 insertions(+), 581 deletions(-) delete mode 100644 examples/classification/shapelet_based (RSAST).ipynb diff --git 
a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 4d7b800a44..b374b4053a 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -6,6 +6,7 @@ __maintainer__ = [] __all__ = ["RSASTClassifier"] + from operator import itemgetter import numpy as np @@ -16,6 +17,7 @@ from aeon.classification import BaseClassifier from aeon.transformations.collection.shapelet_based import RSAST from aeon.utils.numba.general import z_normalise_series +import matplotlib.pyplot as plt class RSASTClassifier(BaseClassifier): @@ -68,8 +70,8 @@ def __init__( n_jobs=-1, ): super().__init__() - self.n_random_points = n_random_points, - self.len_method = len_method, + self.n_random_points = n_random_points + self.len_method = len_method self.nb_inst_per_class = nb_inst_per_class self.n_jobs = n_jobs self.seed = seed @@ -152,7 +154,8 @@ def _predict_proba(self, X): dists[i, np.where(self.classes_ == preds[i])] = 1 return dists - def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): + def plot_most_important_feature_on_ts(self, ts,feature_importance, limit=5): + """Plot the most important features on ts. 
Parameters @@ -169,8 +172,6 @@ def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): fig : plt.figure The figure """ - import matplotlib.pyplot as plt - features = zip(self._transformer._kernel_orig, feature_importance) sorted_features = sorted(features, key=itemgetter(1), reverse=True) @@ -195,4 +196,5 @@ def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): axes[f].plot(range(ts.size), ts, linewidth=2) axes[f].set_title(f"feature: {f+1}") - return fig + #return fig + diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 3bca11fef6..2ee2b343e0 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -1,6 +1,3 @@ -import sys -sys.path.append(r'C:\Users\nicol\aeon') - import numpy as np from numba import get_num_threads, njit, prange, set_num_threads @@ -105,9 +102,10 @@ def __init__( self.n_jobs = n_jobs self.seed = seed self._kernels = None # z-normalized subsequences - self._kernel_orig = None # non z-normalized subsequences + self._cand_length_list = {} + self._kernel_orig = [] self._kernels_generators = {} # Reference time series - self._cand_length_list = None + super().__init__() @@ -141,15 +139,8 @@ def _fit(self, X, y): classes = np.unique(y) self._num_classes = classes.shape[0] - candidates_ts = [] - y = np.asarray([str(x_s) for x_s in y]) - self.cand_length_list = {} - self.kernel_orig_ = [] - self.kernels_generators_ = [] - self.class_generators_ = [] - - list_kernels =[] + y = np.asarray([str(x_s) for x_s in y]) @@ -203,8 +194,10 @@ def _fit(self, X, y): choosen = self._random_state.permutation(X_c.shape[0])[:cnt] + self._kernels_generators[c] = [] + for rep, idx in enumerate(choosen): - self.cand_length_list[c+","+str(idx)+","+str(rep)] = [] + self._cand_length_list[c+","+str(idx)+","+str(rep)] = [] non_zero_acf=[] if (self.len_method == "both" or 
self.len_method == "ACF" or self.len_method == "Max ACF") : #2.1-- Compute Autorrelation per object @@ -216,9 +209,9 @@ def _fit(self, X, y): #Consider just the maximum ACF value if prev_acf!=0 and self.len_method == "Max ACF": non_zero_acf.remove(prev_acf) - self.cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_acf) + self._cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_acf) non_zero_acf.append(j) - self.cand_length_list[c+","+str(idx)+","+str(rep)].append(j) + self._cand_length_list[c+","+str(idx)+","+str(rep)].append(j) prev_acf=j non_zero_pacf=[] @@ -232,21 +225,21 @@ def _fit(self, X, y): #Consider just the maximum PACF value if prev_pacf!=0 and self.len_method == "Max PACF": non_zero_pacf.remove(prev_pacf) - self.cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_pacf) + self._cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_pacf) non_zero_pacf.append(j) - self.cand_length_list[c+","+str(idx)+","+str(rep)].append(j) + self._cand_length_list[c+","+str(idx)+","+str(rep)].append(j) prev_pacf=j if (self.len_method == "all"): - self.cand_length_list[c+","+str(idx)+","+str(rep)].extend(np.arange(3,1+ len(X_c[idx]))) + self._cand_length_list[c+","+str(idx)+","+str(rep)].extend(np.arange(3,1+ len(X_c[idx]))) #2.3-- Save the maximum autocorralated lag value as shapelet lenght - if len(self.cand_length_list[c+","+str(idx)+","+str(rep)])==0: + if len(self._cand_length_list[c+","+str(idx)+","+str(rep)])==0: #chose a random lenght using the lenght of the time series (added 1 since the range start in 0) rand_value= self._random_state.choice(len(X_c[idx]), 1)[0]+1 - self.cand_length_list[c+","+str(idx)+","+str(rep)].extend([max(3,rand_value)]) + self._cand_length_list[c+","+str(idx)+","+str(rep)].extend([max(3,rand_value)]) #elif len(non_zero_acf)==0: #print("There is no AC in TS", idx, " of class ",c) #elif len(non_zero_pacf)==0: @@ -257,9 +250,9 @@ def _fit(self, X, y): #print("Kernel lenght 
list:",self.cand_length_list[c+","+str(idx)],"") #remove duplicates for the list of lenghts - self.cand_length_list[c+","+str(idx)+","+str(rep)]=list(set(self.cand_length_list[c+","+str(idx)+","+str(rep)])) + self._cand_length_list[c+","+str(idx)+","+str(rep)]=list(set(self._cand_length_list[c+","+str(idx)+","+str(rep)])) #print("Len list:"+str(self.cand_length_list[c+","+str(idx)+","+str(rep)])) - for max_shp_length in self.cand_length_list[c+","+str(idx)+","+str(rep)]: + for max_shp_length in self._cand_length_list[c+","+str(idx)+","+str(rep)]: #2.4-- Choose randomly n_random_points point for a TS #2.5-- calculate the weights of probabilities for a random point in a TS @@ -288,29 +281,29 @@ def _fit(self, X, y): for i in rand_point_ts: #2.6-- Extract the subsequence with that point - kernel = X_c[idx][i:i+max_shp_length].reshape(1,-1) + kernel = X_c[idx][i:i+max_shp_length].reshape(1,-1).copy() #print("kernel:"+str(kernel)) if m_kernel < max_shp_length: m_kernel = max_shp_length - list_kernels.append(kernel) - self.kernel_orig_.append(np.squeeze(kernel)) - self.kernels_generators_.append(np.squeeze(X_c[idx].reshape(1,-1))) - self.class_generators_.append(c) + + self._kernel_orig.append(np.squeeze(kernel)) + self._kernels_generators[c].extend(X_c[idx].reshape(1,-1)) + + - print("total kernels:"+str(len(self.kernel_orig_))) #3--save the calculated subsequences - n_kernels = len (self.kernel_orig_) + n_kernels = len (self._kernel_orig) self._kernels = np.full( (n_kernels, m_kernel), dtype=np.float32, fill_value=np.nan) - for k, kernel in enumerate(self.kernel_orig_): + for k, kernel in enumerate(self._kernel_orig): self._kernels[k, :len(kernel)] = z_normalise_series(kernel) return self @@ -344,28 +337,3 @@ def _transform(self, X, y=None): return X_transformed -if __name__ == "__main__": - - from sklearn.ensemble import RandomForestClassifier - - from aeon.datasets import load_basic_motions,load_classification - - from aeon.transformations.collection.shapelet_based 
import RSAST - - from sklearn.metrics import accuracy_score - - #X, y = load_basic_motions(split="train") - X, y = load_classification(name="Chinatown",split="train") - rsast = RSAST( ) - - rsast.fit(X, y) - st = rsast.transform(X, y) - print(" Shape of transformed data = ", st.shape) - print(" Distance of second series to third shapelet = ", st[1][2]) - #testX, testy = load_basic_motions(split="test") - testX, testy = load_classification(name="Chinatown",split="train") - tr_test = rsast.transform(testX) - rf = RandomForestClassifier(random_state=10) - rf.fit(st, y) - preds = rf.predict(tr_test) - print(" Shapelets + random forest acc = ", accuracy_score(preds, testy)) \ No newline at end of file diff --git a/examples/classification/shapelet_based (RSAST).ipynb b/examples/classification/shapelet_based (RSAST).ipynb deleted file mode 100644 index 06586b10d6..0000000000 --- a/examples/classification/shapelet_based (RSAST).ipynb +++ /dev/null @@ -1,519 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "# Shapelet based time series machine learning\n", - "\n", - "Shapelets a subsections of times series taken from the train data that are a useful for time series machine learning. They were first proposed ia primitive for machine learning [1][2] and were embedded in a decision tree for classification. The Shapelet Transform Classifier (STC)[3,4] is a pipeline classifier which searches the training data for shapelets, transforms series to vectors of distances to a filtered set of selected shapelets based on information gain, then builds a classifier on the latter.\n", - "\n", - "Finding shapelets involves selecting and evaluating shapelets. The original shapelet tree and STC performed a full enumeration of all possible shapelets before keeping the best ones. This is computationally inefficient, and modern shapelet based machine learning algorithms randomise the search." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.append(r'C:\\Users\\nicol\\aeon')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('MrSQMClassifier',\n", - " aeon.classification.shapelet_based._mrsqm.MrSQMClassifier),\n", - " ('RDSTClassifier', aeon.classification.shapelet_based._rdst.RDSTClassifier),\n", - " ('ShapeletTransformClassifier',\n", - " aeon.classification.shapelet_based._stc.ShapeletTransformClassifier)]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import warnings\n", - "\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.metrics import accuracy_score\n", - "\n", - "from aeon.datasets import load_basic_motions\n", - "from aeon.registry import all_estimators\n", - "from aeon.transformations.collection.shapelet_based import RandomShapeletTransform\n", - "\n", - "warnings.filterwarnings(\"ignore\")\n", - "all_estimators(\"classifier\", filter_tags={\"algorithm_type\": \"shapelet\"})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "### Shapelet Transform for Classification\n", - "\n", - "The `RandomShapeletTransform` transformer takes a set of labelled training time series in the `fit` function, randomly samples `n_shapelet_samples` shapelets, keeping the best `max_shapelets`. The resulting shapelets are used in the `transform` function to create a new tabular dataset, where each row represents a time series instance, and each column stores the distance from a time series to a shapelet. The resulting tabular data can be used by any scikit learn compatible classifier. In this notebook we will explain these terms and describe how the algorithm works. But first we show it in action. 
We will use the BasicMotions data as an example. This data set contains time series of motion traces for the activities \"running\", \"walking\", \"standing\" and \"badminton\". The learning problem is to predict the activity given the time series. Each time series has six channels: x, y, z position and x, y, z accelerometer of the wrist. Data was recorded on a smart watch." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Shape of transformed data = (40, 8)\n", - " Distance of second series to third shapelet = 1.302772121165026\n", - " Shapelets + random forest acc = 0.95\n" - ] - } - ], - "source": [ - "X, y = load_basic_motions(split=\"train\")\n", - "rst = RandomShapeletTransform(n_shapelet_samples=100, max_shapelets=10, random_state=42)\n", - "st = rst.fit_transform(X, y)\n", - "print(\" Shape of transformed data = \", st.shape)\n", - "print(\" Distance of second series to third shapelet = \", st[1][2])\n", - "testX, testy = load_basic_motions(split=\"test\")\n", - "tr_test = rst.transform(testX)\n", - "rf = RandomForestClassifier(random_state=10)\n", - "rf.fit(st, y)\n", - "preds = rf.predict(tr_test)\n", - "print(\" Shapelets + random forest acc = \", accuracy_score(preds, testy))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "### Visualising Shapelets\n", - "The first column of the transformed data represents the distance from the first shapelet to each time series. The shapelets are sorted, so the first shapelet is the one we estimate is the best (using the calculation described below). You can recover the shapelets from the transform. 
Each shapelet is a 7-tuple, storing the following information:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Quality = 0.81127812\n", - "Length = 39\n", - "position = 55\n", - "Channel = 0\n", - "Origin Instance Index = 11\n", - "Class label = running\n", - "Shapelet = [-0.85667017 -1.88711152 -0.8751295 0.80633757 1.10838333 0.69810992\n", - " 0.85713394 1.23190921 0.01801365 -1.29683966 -1.94694259 -0.37487726\n", - " -0.37487726 1.39471462 0.74922685 0.74922685 0.22343376 0.22343376\n", - " -0.7730703 -1.37591995 -0.80376393 1.32758071 0.99778845 0.6013481\n", - " 0.83711118 0.93684593 0.93684593 -1.30429475 -1.64522057 -0.56312308\n", - " 0.96855713 0.56796251 0.35714242 0.62066541 0.65135287 -0.80531237\n", - " -1.49170075 -1.18512797 0.69685753]\n" - ] - } - ], - "source": [ - "running_shapelet = rst.shapelets[0]\n", - "print(\"Quality = \", running_shapelet[0])\n", - "print(\"Length = \", running_shapelet[1])\n", - "print(\"position = \", running_shapelet[2])\n", - "print(\"Channel = \", running_shapelet[3])\n", - "print(\"Origin Instance Index = \", running_shapelet[4])\n", - "print(\"Class label = \", running_shapelet[5])\n", - "print(\"Shapelet = \", running_shapelet[6])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We can directly extract shapelets and inspect them. These are the the two shapelets that are best at discriminating badminton and running against other activities. All shapelets are normalised to provide scale invariance." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Badminton shapelet from channel 0 (x-dimension) (0.65194393, 74, 7, 1, 1, 'standing', array([-5.27667376, -0.94911454, 0.90433173, 1.26316864, 2.34760078,\n", - " 1.84408 , 0.9192852 , 0.9192852 , -1.29868372, -1.29868372,\n", - " -1.5476774 , -1.03000413, 0.27593674, -0.70184658, 0.37460295,\n", - " 1.27398121, 1.02881837, 0.64543662, -0.0669839 , -0.54373096,\n", - " -0.55716134, -0.56605101, -0.08611633, 0.31270572, 0.25642625,\n", - " 0.5512744 , 0.78929504, 0.73385326, 0.73385326, -0.26777726,\n", - " -0.63967737, -0.63967737, -0.5539071 , -0.5539071 , 0.3867047 ,\n", - " 0.3867047 , 0.88832979, 0.85074214, 0.46901267, 0.0925433 ,\n", - " -0.34444436, -0.72498936, -0.83763127, -0.53034818, -0.05869122,\n", - " 0.46600593, 1.02537238, 0.81800526, 0.51709059, 0.17497366,\n", - " -0.31072836, -0.64876695, -0.89102368, -0.60834799, -0.0627886 ,\n", - " 0.42532723, 0.95696668, 0.91077086, 0.77491818, 0.14283377,\n", - " 0.14283377, -1.08722874, -1.08722874, -0.65706914, -0.65706914,\n", - " 0.28210933, 0.74159654, 0.8064869 , 0.8064869 , 0.19889294,\n", - " -0.16601048, -0.78706337, -0.76364317, -0.63789726]))\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "badminton_shapelet = rst.shapelets[4]\n", - "print(\" Badminton shapelet from channel 0 (x-dimension)\", badminton_shapelet)\n", - "plt.title(\"Best shapelets for running and badminton\")\n", - "plt.plot(badminton_shapelet[6], label=\"Badminton\")\n", - "plt.plot(running_shapelet[6], label=\"Running\")\n", - "plt.legend()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Both shapelets are in the x-axis, so represent side to side motion. Badminton is characterised by sa single large peak in one direction, capturing the drawing of the hand back and quickly hittig the shuttlcock. Running is chaaracterised by a longer repetition of side to side motions, with a sharper peak representing bringing the arm forward accross the body in a running motion." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## Performance on the UCR univariate datasets\n", - "\n", - "You can find the interval based classifiers as follows." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MrSQMClassifier\n", - "RDSTClassifier\n", - "ShapeletTransformClassifier\n" - ] - } - ], - "source": [ - "from aeon.registry import all_estimators\n", - "\n", - "est = [\"MrSQMClassifier\", \"RDSTClassifier\", \"ShapeletTransformClassifier\"]\n", - "for c in est:\n", - " print(c)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(112, 3)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from aeon.benchmarking import get_estimator_results_as_array\n", - "from aeon.datasets.tsc_data_lists import univariate\n", - "\n", - "names = [t.replace(\"Classifier\", \"\") for t in est]\n", - "results, present_names = get_estimator_results_as_array(\n", - " names, univariate, include_missing=False\n", - ")\n", - "results.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(
, )" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from aeon.visualisation import plot_boxplot_median, plot_critical_difference\n", - "\n", - "plot_critical_difference(results, names)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(
, )" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_boxplot_median(results, names)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Scalable and Accurate Subsequence Transform (SAST)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false, - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "\n", - "from aeon.datasets import load_basic_motions,load_classification\n", - "\n", - "from aeon.transformations.collection.shapelet_based import SAST\n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Shape of transformed data = (20, 504)\n", - " Distance of second series to third shapelet = 0.0004470423\n", - " Shapelets + random forest acc = 1.0\n" - ] - } - ], - "source": [ - "#X, y = load_basic_motions(split=\"train\")\n", - "X, y = load_classification(name=\"Chinatown\",split=\"train\")\n", - "sast = SAST(lengths=None,\n", - " stride=1,\n", - " nb_inst_per_class=1,\n", - " seed=42,\n", - " n_jobs=-1)\n", - "st = sast.fit_transform(X, y)\n", - "print(\" Shape of transformed data = \", st.shape)\n", - "print(\" Distance of second series to third shapelet = \", st[1][2])\n", - "#testX, testy = load_basic_motions(split=\"test\")\n", - "testX, testy = load_classification(name=\"Chinatown\",split=\"train\")\n", - "tr_test = sast.transform(testX)\n", - "rf = RandomForestClassifier(random_state=10)\n", - "rf.fit(st, y)\n", - "preds = rf.predict(tr_test)\n", - "print(\" Shapelets + random forest acc = \", accuracy_score(preds, testy))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Random Scalable and Accurate Subsequence Transform (RSAST)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": 
[], - "source": [ - "import sys\n", - "sys.path.append(r'C:\\Users\\nicol\\aeon')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false, - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "from sklearn.ensemble import RandomForestClassifier\n", - "\n", - "from aeon.datasets import load_basic_motions,load_classification\n", - "\n", - "from aeon.transformations.collection.shapelet_based import RSAST\n", - "\n", - "from sklearn.metrics import accuracy_score" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total kernels:570\n", - " Shape of transformed data = (20, 570)\n", - " Distance of second series to third shapelet = 0.16883065\n", - " Shapelets + random forest acc = 1.0\n" - ] - } - ], - "source": [ - "#X, y = load_basic_motions(split=\"train\")\n", - "X, y = load_classification(name=\"Chinatown\",split=\"train\")\n", - "rsast = RSAST(n_random_points=10,\n", - " len_method=\"both\",\n", - " nb_inst_per_class=10,\n", - " seed=None,\n", - " n_jobs=-1)\n", - "st = rsast.fit_transform(X, y)\n", - "print(\" Shape of transformed data = \", st.shape)\n", - "print(\" Distance of second series to third shapelet = \", st[1][2])\n", - "#testX, testy = load_basic_motions(split=\"test\")\n", - "testX, testy = load_classification(name=\"Chinatown\",split=\"train\")\n", - "tr_test = rsast.transform(testX)\n", - "rf = RandomForestClassifier(random_state=10)\n", - "rf.fit(st, y)\n", - "preds = rf.predict(tr_test)\n", - "print(\" Shapelets + random forest acc = \", accuracy_score(preds, testy))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": 
"python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From cf6582dddf8ab2df7fb2338344d66427c1557f23 Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sun, 7 Apr 2024 13:47:43 +0200 Subject: [PATCH 06/38] updated init --- aeon/classification/shapelet_based/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/classification/shapelet_based/__init__.py b/aeon/classification/shapelet_based/__init__.py index f810ae0259..afd5fff3fa 100644 --- a/aeon/classification/shapelet_based/__init__.py +++ b/aeon/classification/shapelet_based/__init__.py @@ -5,7 +5,7 @@ "ShapeletTransformClassifier", "RDSTClassifier", "SASTClassifier", - "RSASTClassifier", + "RSASTClassifier" ] from aeon.classification.shapelet_based._mrsqm import MrSQMClassifier From f5eab2d73037939567e081401bdf6efeb5601661 Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sun, 7 Apr 2024 15:47:24 +0200 Subject: [PATCH 07/38] included LearningShapeletClassifier --- aeon/classification/shapelet_based/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/aeon/classification/shapelet_based/__init__.py b/aeon/classification/shapelet_based/__init__.py index afd5fff3fa..2c24102207 100644 --- a/aeon/classification/shapelet_based/__init__.py +++ b/aeon/classification/shapelet_based/__init__.py @@ -5,9 +5,10 @@ "ShapeletTransformClassifier", "RDSTClassifier", "SASTClassifier", - "RSASTClassifier" + "RSASTClassifier", + "LearningShapeletClassifier", ] - +from aeon.classification.shapelet_based._ls import LearningShapeletClassifier from aeon.classification.shapelet_based._mrsqm import MrSQMClassifier from aeon.classification.shapelet_based._rdst import RDSTClassifier from aeon.classification.shapelet_based._sast_classifier import SASTClassifier From 2ec5ab398d5ae7da13eaa70166e988eaa47439d1 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 10:25:54 
+0200 Subject: [PATCH 08/38] updated format comments --- .../shapelet_based/_rsast_classifier.py | 8 +- .../collection/shapelet_based/__init__.py | 2 +- .../collection/shapelet_based/_rsast.py | 86 +++++-------------- 3 files changed, 27 insertions(+), 69 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index b374b4053a..61a12119ba 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -26,7 +26,9 @@ class RSASTClassifier(BaseClassifier): Parameters ---------- n_random_points: int default = 10 the number of initial random points to extract - len_method: string default="both" the type of statistical tool used to get the length of shapelets. "both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, "None"=Extract randomly any length from the TS + len_method: string default="both" the type of statistical tool used to get the + length of shapelets. "both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, + "None"=Extract randomly any length from the TS nb_inst_per_class : int default = 10 the number of reference time series to select per class seed : int, default = None @@ -154,7 +156,7 @@ def _predict_proba(self, X): dists[i, np.where(self.classes_ == preds[i])] = 1 return dists - def plot_most_important_feature_on_ts(self, ts,feature_importance, limit=5): + def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): """Plot the most important features on ts. 
@@ -196,5 +198,5 @@ def plot_most_important_feature_on_ts(self, ts,feature_importance, limit=5): axes[f].plot(range(ts.size), ts, linewidth=2) axes[f].set_title(f"feature: {f+1}") - #return fig + diff --git a/aeon/transformations/collection/shapelet_based/__init__.py b/aeon/transformations/collection/shapelet_based/__init__.py index 5b134c2c56..e7851a5dbe 100644 --- a/aeon/transformations/collection/shapelet_based/__init__.py +++ b/aeon/transformations/collection/shapelet_based/__init__.py @@ -1,6 +1,6 @@ """Shapelet based transformers.""" -__all__ = ["RandomShapeletTransform", "RandomDilatedShapeletTransform", "SAST", "RSAST" ] +__all__ = ["RandomShapeletTransform", "RandomDilatedShapeletTransform", "SAST", "RSAST"] from aeon.transformations.collection.shapelet_based._dilated_shapelet_transform import ( RandomDilatedShapeletTransform, diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 2ee2b343e0..53611d9eb0 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -127,7 +127,7 @@ def _fit(self, X, y): """ - #0- initialize variables and convert values in "y" to string + # 0- initialize variables and convert values in "y" to string X_ = np.reshape(X, (X.shape[0], X.shape[-1])) self._random_state = ( @@ -139,58 +139,41 @@ def _fit(self, X, y): classes = np.unique(y) self._num_classes = classes.shape[0] - y = np.asarray([str(x_s) for x_s in y]) - - n = [] classes = np.unique(y) self.num_classes = classes.shape[0] m_kernel = 0 - #1--calculate ANOVA per each time t throught the lenght of the TS + # 1--calculate ANOVA per each time t throught the lenght of the TS for i in range (X_.shape[1]): statistic_per_class= {} for c in classes: assert len(X_[np.where(y==c)[0]][:,i])> 0, 'Time t without values in TS' - statistic_per_class[c]=X_[np.where(y==c)[0]][:,i] - #print("statistic_per_class- i:"+str(i)+', 
c:'+str(c)) - #print(statistic_per_class[c].shape) - - - #print('Without pd series') - #print(statistic_per_class) statistic_per_class=pd.Series(statistic_per_class) - #statistic_per_class = list(statistic_per_class.values()) # Calculate t-statistic and p-value - try: t_statistic, p_value = f_oneway(*statistic_per_class) except DegenerateDataWarning or ConstantInputWarning: p_value = np.nan - - #print('statistic_per_class', str(statistic_per_class)) + # Interpretation of the results # if p_value < 0.05: " The means of the populations are significantly different." - #print('pvalue', str(p_value)) if np.isnan(p_value): n.append(0) else: n.append(1-p_value) - - - - #2--calculate PACF and ACF for each TS chossen in each class + # 2--calculate PACF and ACF for each TS chossen in each class for i, c in enumerate(classes): X_c = X_[y == c] cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) - #set if the selection of instances is with replacement (if false it is not posible to select the same intance more than one) + # set if the selection of instances is with replacement (if false it is not posible to select the same intance more than one) choosen = self._random_state.permutation(X_c.shape[0])[:cnt] @@ -200,13 +183,13 @@ def _fit(self, X, y): self._cand_length_list[c+","+str(idx)+","+str(rep)] = [] non_zero_acf=[] if (self.len_method == "both" or self.len_method == "ACF" or self.len_method == "Max ACF") : - #2.1-- Compute Autorrelation per object + # 2.1-- Compute Autorrelation per object acf_val, acf_confint = acf(X_c[idx], nlags=len(X_c[idx])-1, alpha=.05) prev_acf=0 for j, conf in enumerate(acf_confint): if(3<=j and (0 < acf_confint[j][0] <= acf_confint[j][1] or acf_confint[j][0] <= acf_confint[j][1] < 0) ): - #Consider just the maximum ACF value + # Consider just the maximum ACF value if prev_acf!=0 and self.len_method == "Max ACF": non_zero_acf.remove(prev_acf) self._cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_acf) @@ -216,13 +199,13 @@ 
def _fit(self, X, y): non_zero_pacf=[] if (self.len_method == "both" or self.len_method == "PACF" or self.len_method == "Max PACF"): - #2.2 Compute Partial Autorrelation per object + # 2.2 Compute Partial Autorrelation per object pacf_val, pacf_confint = pacf(X_c[idx], method="ols", nlags=(len(X_c[idx])//2) - 1, alpha=.05) prev_pacf=0 for j, conf in enumerate(pacf_confint): if(3<=j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or pacf_confint[j][0] <= pacf_confint[j][1] < 0) ): - #Consider just the maximum PACF value + # Consider just the maximum PACF value if prev_pacf!=0 and self.len_method == "Max PACF": non_zero_pacf.remove(prev_pacf) self._cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_pacf) @@ -234,72 +217,47 @@ def _fit(self, X, y): if (self.len_method == "all"): self._cand_length_list[c+","+str(idx)+","+str(rep)].extend(np.arange(3,1+ len(X_c[idx]))) - #2.3-- Save the maximum autocorralated lag value as shapelet lenght - + # 2.3-- Save the maximum autocorralated lag value as shapelet lenght if len(self._cand_length_list[c+","+str(idx)+","+str(rep)])==0: - #chose a random lenght using the lenght of the time series (added 1 since the range start in 0) + # chose a random lenght using the lenght of the time series (added 1 since the range start in 0) rand_value= self._random_state.choice(len(X_c[idx]), 1)[0]+1 self._cand_length_list[c+","+str(idx)+","+str(rep)].extend([max(3,rand_value)]) - #elif len(non_zero_acf)==0: - #print("There is no AC in TS", idx, " of class ",c) - #elif len(non_zero_pacf)==0: - #print("There is no PAC in TS", idx, " of class ",c) - #else: - #print("There is AC and PAC in TS", idx, " of class ",c) - - #print("Kernel lenght list:",self.cand_length_list[c+","+str(idx)],"") - - #remove duplicates for the list of lenghts + self._cand_length_list[c+","+str(idx)+","+str(rep)]=list(set(self._cand_length_list[c+","+str(idx)+","+str(rep)])) - #print("Len list:"+str(self.cand_length_list[c+","+str(idx)+","+str(rep)])) + for 
max_shp_length in self._cand_length_list[c+","+str(idx)+","+str(rep)]: - - #2.4-- Choose randomly n_random_points point for a TS - #2.5-- calculate the weights of probabilities for a random point in a TS + # 2.4-- Choose randomly n_random_points point for a TS + # 2.5-- calculate the weights of probabilities for a random point in a TS if sum(n) == 0 : # Determine equal weights of a random point point in TS is there are no significant points - # print('All p values in One way ANOVA are equal to 0') weights = [1/len(n) for i in range(len(n))] weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum(weights[:len(X_c[idx])-max_shp_length+1]) else: # Determine the weights of a random point point in TS (excluding points after n-l+1) weights = n / np.sum(n) weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum(weights[:len(X_c[idx])-max_shp_length+1]) - - if self.n_random_points > len(X_c[idx])-max_shp_length+1 : - #set a upper limit for the posible of number of random points when selecting without replacement + # set a upper limit for the posible of number of random points when selecting without replacement limit_rpoint=len(X_c[idx])-max_shp_length+1 rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, limit_rpoint, p=weights, replace=False) - #print("limit_rpoint:"+str(limit_rpoint)) + else: rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, self.n_random_points, p=weights, replace=False) - - - - + for i in rand_point_ts: - #2.6-- Extract the subsequence with that point + # 2.6-- Extract the subsequence with that point kernel = X_c[idx][i:i+max_shp_length].reshape(1,-1).copy() - #print("kernel:"+str(kernel)) + if m_kernel < max_shp_length: m_kernel = max_shp_length self._kernel_orig.append(np.squeeze(kernel)) self._kernels_generators[c].extend(X_c[idx].reshape(1,-1)) - - - - - - - #3--save the calculated subsequences - + # 3--save the calculated subsequences n_kernels = len (self._kernel_orig) - self._kernels = np.full( 
(n_kernels, m_kernel), dtype=np.float32, fill_value=np.nan) @@ -310,7 +268,6 @@ def _fit(self, X, y): def _transform(self, X, y=None): """Transform the input X using the generated subsequences. - Parameters ---------- X: np.ndarray shape (n_cases, n_channels, n_timepoints) @@ -331,7 +288,6 @@ def _transform(self, X, y=None): set_num_threads(n_jobs) - X_transformed = _apply_kernels(X_, self._kernels) # subsequence transform of X set_num_threads(prev_threads) From c08038be5ef1d9fb7e6cd2040b35de0a6bab86a1 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 11:02:53 +0200 Subject: [PATCH 09/38] corrected spaces --- .../shapelet_based/_rsast_classifier.py | 20 +++---- .../collection/shapelet_based/_rsast.py | 55 ++++++++++--------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 61a12119ba..62d06b8359 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -41,7 +41,8 @@ class RSASTClassifier(BaseClassifier): Reference --------- - .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling Shapelets for Time Series Classification. + .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling + Shapelets for Time Series Classification. 
https://hal.science/hal-04311309/ Examples @@ -64,12 +65,12 @@ class RSASTClassifier(BaseClassifier): def __init__( self, - n_random_points=10, - len_method="both", - nb_inst_per_class=10, - seed=None, - classifier=None, - n_jobs=-1, + n_random_points = 10, + len_method = "both", + nb_inst_per_class = 10, + seed = None, + classifier = None, + n_jobs = -1, ): super().__init__() self.n_random_points = n_random_points @@ -97,7 +98,7 @@ def _fit(self, X, y): """ self._transformer = RSAST( self.n_random_points, - self.len_method, + self.len_method, self.nb_inst_per_class, self.seed, self.n_jobs, @@ -197,6 +198,3 @@ def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): axes[f].plot(range(start_pos, start_pos + kernel.size), kernel, linewidth=5) axes[f].plot(range(ts.size), ts, linewidth=2) axes[f].set_title(f"feature: {f+1}") - - - diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 53611d9eb0..c729aff96d 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -40,19 +40,23 @@ def _apply_kernels(X, kernels): class RSAST(BaseCollectionTransformer): """Random Scalable and Accurate Subsequence Transform (SAST). - RSAST [1] is based on SAST, it uses a stratified sampling strategy for subsequences selection but additionally takes into account certain - statistical criteria such as ANOVA, ACF, and PACF to further reduce the search space of shapelets. + RSAST [1] is based on SAST, it uses a stratified sampling strategy for subsequences selection but + additionally takes into account certain statistical criteria such as ANOVA, ACF, and PACF to + further reduce the search space of shapelets. - RSAST starts with the pre-computation of a list of weights, using ANOVA, which helps in the selection of initial points for - subsequences. 
Then randomly select k time series per class, which are used with an ACF and PACF, obtaining a set of highly correlated - lagged values. These values are used as potential lengths for the shapelets. Lastly, with a pre-defined number of admissible starting - points to sample, the shapelets are extracted and used to transform the original dataset, replacing each time series by the vector of - its distance to each subsequence. + RSAST starts with the pre-computation of a list of weights, using ANOVA, which helps in the + selection of initial points for subsequences. Then randomly select k time series per class, + which are used with an ACF and PACF, obtaining a set of highly correlated lagged values. + These values are used as potential lengths for the shapelets. Lastly, with a pre-defined + number of admissible starting points to sample, the shapelets are extracted and used to + transform the original dataset, replacing each time series by the vector of its distance + to each subsequence. Parameters ---------- n_random_points: int default = 10 the number of initial random points to extract - len_method: string default="both" the type of statistical tool used to get the length of shapelets. "both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, "None"=Extract randomly any length from the TS + len_method: string default="both" the type of statistical tool used to get the length of + shapelets. "both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, "None"=Extract randomly any length from the TS nb_inst_per_class : int default = 10 the number of reference time series to select per class seed : int, default = None @@ -64,7 +68,8 @@ class RSAST(BaseCollectionTransformer): Reference --------- - .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling Shapelets for Time Series Classification. + .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling Shapelets for + Time Series Classification. 
https://hal.science/hal-04311309/ @@ -108,8 +113,6 @@ def __init__( super().__init__() - - def _fit(self, X, y): """Select reference time series and generate subsequences from them. @@ -148,10 +151,10 @@ def _fit(self, X, y): # 1--calculate ANOVA per each time t throught the lenght of the TS for i in range (X_.shape[1]): - statistic_per_class= {} + statistic_per_class = {} for c in classes: - assert len(X_[np.where(y==c)[0]][:,i])> 0, 'Time t without values in TS' - statistic_per_class[c]=X_[np.where(y==c)[0]][:,i] + assert len(X_[np.where(y == c)[0]][:,i]) > 0, 'Time t without values in TS' + statistic_per_class[c] = X_[np.where(y == c)[0]][:,i] statistic_per_class=pd.Series(statistic_per_class) # Calculate t-statistic and p-value @@ -181,7 +184,7 @@ def _fit(self, X, y): for rep, idx in enumerate(choosen): self._cand_length_list[c+","+str(idx)+","+str(rep)] = [] - non_zero_acf=[] + non_zero_acf = [] if (self.len_method == "both" or self.len_method == "ACF" or self.len_method == "Max ACF") : # 2.1-- Compute Autorrelation per object acf_val, acf_confint = acf(X_c[idx], nlags=len(X_c[idx])-1, alpha=.05) @@ -197,11 +200,11 @@ def _fit(self, X, y): self._cand_length_list[c+","+str(idx)+","+str(rep)].append(j) prev_acf=j - non_zero_pacf=[] + non_zero_pacf = [] if (self.len_method == "both" or self.len_method == "PACF" or self.len_method == "Max PACF"): # 2.2 Compute Partial Autorrelation per object - pacf_val, pacf_confint = pacf(X_c[idx], method="ols", nlags=(len(X_c[idx])//2) - 1, alpha=.05) - prev_pacf=0 + pacf_val, pacf_confint = pacf(X_c[idx], method = "ols", nlags=(len(X_c[idx])//2) - 1, alpha = .05) + prev_pacf = 0 for j, conf in enumerate(pacf_confint): if(3<=j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or pacf_confint[j][0] <= pacf_confint[j][1] < 0) ): @@ -218,12 +221,12 @@ def _fit(self, X, y): self._cand_length_list[c+","+str(idx)+","+str(rep)].extend(np.arange(3,1+ len(X_c[idx]))) # 2.3-- Save the maximum autocorralated lag value as shapelet lenght 
- if len(self._cand_length_list[c+","+str(idx)+","+str(rep)])==0: + if len(self._cand_length_list[c+","+str(idx)+","+str(rep)]) == 0: # chose a random lenght using the lenght of the time series (added 1 since the range start in 0) - rand_value= self._random_state.choice(len(X_c[idx]), 1)[0]+1 + rand_value = self._random_state.choice(len(X_c[idx]), 1)[0]+1 self._cand_length_list[c+","+str(idx)+","+str(rep)].extend([max(3,rand_value)]) - self._cand_length_list[c+","+str(idx)+","+str(rep)]=list(set(self._cand_length_list[c+","+str(idx)+","+str(rep)])) + self._cand_length_list[c+","+str(idx)+","+str(rep)] = list(set(self._cand_length_list[c+","+str(idx)+","+str(rep)])) for max_shp_length in self._cand_length_list[c+","+str(idx)+","+str(rep)]: # 2.4-- Choose randomly n_random_points point for a TS @@ -239,11 +242,11 @@ def _fit(self, X, y): if self.n_random_points > len(X_c[idx])-max_shp_length+1 : # set a upper limit for the posible of number of random points when selecting without replacement - limit_rpoint=len(X_c[idx])-max_shp_length+1 - rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, limit_rpoint, p=weights, replace=False) + limit_rpoint = len(X_c[idx])-max_shp_length+1 + rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, limit_rpoint, p = weights, replace = False) else: - rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, self.n_random_points, p=weights, replace=False) + rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, self.n_random_points, p = weights, replace = False) for i in rand_point_ts: # 2.6-- Extract the subsequence with that point @@ -259,14 +262,14 @@ def _fit(self, X, y): n_kernels = len (self._kernel_orig) self._kernels = np.full( - (n_kernels, m_kernel), dtype=np.float32, fill_value=np.nan) + (n_kernels, m_kernel), dtype = np.float32, fill_value = np.nan) for k, kernel in enumerate(self._kernel_orig): self._kernels[k, :len(kernel)] = 
z_normalise_series(kernel) return self - def _transform(self, X, y=None): + def _transform(self, X, y = None): """Transform the input X using the generated subsequences. Parameters ---------- From 8728d92fb2f25049e44f41b86ef023723bd2abdf Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 13:26:14 +0200 Subject: [PATCH 10/38] corrected identation --- .../shapelet_based/_rsast_classifier.py | 12 ++-- .../collection/shapelet_based/_rsast.py | 56 ++++++++++--------- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 62d06b8359..2bbaef9eae 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -65,12 +65,12 @@ class RSASTClassifier(BaseClassifier): def __init__( self, - n_random_points = 10, - len_method = "both", - nb_inst_per_class = 10, - seed = None, - classifier = None, - n_jobs = -1, + n_random_points=10, + len_method="both", + nb_inst_per_class=10, + seed=None, + classifier=None, + n_jobs=-1, ): super().__init__() self.n_random_points = n_random_points diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index c729aff96d..2eddbcdd21 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -40,23 +40,27 @@ def _apply_kernels(X, kernels): class RSAST(BaseCollectionTransformer): """Random Scalable and Accurate Subsequence Transform (SAST). - RSAST [1] is based on SAST, it uses a stratified sampling strategy for subsequences selection but - additionally takes into account certain statistical criteria such as ANOVA, ACF, and PACF to - further reduce the search space of shapelets. 
+ RSAST [1] is based on SAST, it uses a stratified sampling strategy + for subsequences selection but additionally takes into account certain + statistical criteria such as ANOVA, ACF, and PACF to further reduce + the search space of shapelets. - RSAST starts with the pre-computation of a list of weights, using ANOVA, which helps in the - selection of initial points for subsequences. Then randomly select k time series per class, - which are used with an ACF and PACF, obtaining a set of highly correlated lagged values. - These values are used as potential lengths for the shapelets. Lastly, with a pre-defined - number of admissible starting points to sample, the shapelets are extracted and used to - transform the original dataset, replacing each time series by the vector of its distance - to each subsequence. + RSAST starts with the pre-computation of a list of weights, using ANOVA, + which helps in the selection of initial points for subsequences. Then + randomly select k time series per class, which are used with an ACF and PACF, + obtaining a set of highly correlated lagged values. These values are used as + potential lengths for the shapelets. Lastly, with a pre-defined number of + admissible starting points to sample, the shapelets are extracted and used to + transform the original dataset, replacing each time series by the vector of its + distance to each subsequence. Parameters ---------- n_random_points: int default = 10 the number of initial random points to extract - len_method: string default="both" the type of statistical tool used to get the length of - shapelets. "both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, "None"=Extract randomly any length from the TS + len_method: string default="both" the type of statistical tool used to get + the length of shapelets. 
"both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, + "None"=Extract randomly any length from the TS + nb_inst_per_class : int default = 10 the number of reference time series to select per class seed : int, default = None @@ -68,8 +72,8 @@ class RSAST(BaseCollectionTransformer): Reference --------- - .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling Shapelets for - Time Series Classification. + .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). + RSAST: Sampling Shapelets for Time Series Classification. https://hal.science/hal-04311309/ @@ -95,11 +99,11 @@ class RSAST(BaseCollectionTransformer): def __init__( self, - n_random_points = 10, - len_method = "both", - nb_inst_per_class = 10, - seed = None, - n_jobs = -1, + n_random_points=10, + len_method="both", + nb_inst_per_class=10, + seed=None, + n_jobs=-1, ): self.n_random_points = n_random_points self.len_method = len_method @@ -130,7 +134,7 @@ def _fit(self, X, y): """ - # 0- initialize variables and convert values in "y" to string + # 0- initialize variables and convert values in "y" to string X_ = np.reshape(X, (X.shape[0], X.shape[-1])) self._random_state = ( @@ -150,13 +154,15 @@ def _fit(self, X, y): m_kernel = 0 # 1--calculate ANOVA per each time t throught the lenght of the TS - for i in range (X_.shape[1]): + for i in range(X_.shape[1]): statistic_per_class = {} for c in classes: - assert len(X_[np.where(y == c)[0]][:,i]) > 0, 'Time t without values in TS' - statistic_per_class[c] = X_[np.where(y == c)[0]][:,i] + assert len( + X_[np.where(y == c)[0]][:, i] + ) > 0, 'Time t without values in TS' + statistic_per_class[c] = X_[np.where(y == c)[0]][:, i] - statistic_per_class=pd.Series(statistic_per_class) + statistic_per_class = pd.Series(statistic_per_class) # Calculate t-statistic and p-value try: t_statistic, p_value = f_oneway(*statistic_per_class) @@ -218,7 +224,7 @@ def _fit(self, X, y): prev_pacf=j if (self.len_method == "all"): - 
self._cand_length_list[c+","+str(idx)+","+str(rep)].extend(np.arange(3,1+ len(X_c[idx]))) + self._cand_length_list[c+","+str(idx)+","+str(rep)].extend(np.arange(3, 1+ len(X_c[idx]))) # 2.3-- Save the maximum autocorralated lag value as shapelet lenght if len(self._cand_length_list[c+","+str(idx)+","+str(rep)]) == 0: From 1dc790348c2291458c4610a2604f5e79368b7ad7 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 13:49:53 +0200 Subject: [PATCH 11/38] updated identation --- .../collection/shapelet_based/_rsast.py | 93 +++++++++++++------ 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 2eddbcdd21..dee23146b7 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -166,11 +166,12 @@ def _fit(self, X, y): # Calculate t-statistic and p-value try: t_statistic, p_value = f_oneway(*statistic_per_class) - except DegenerateDataWarning or ConstantInputWarning: + except (DegenerateDataWarning, ConstantInputWarning): p_value = np.nan # Interpretation of the results - # if p_value < 0.05: " The means of the populations are significantly different." + # if p_value < 0.05: " The means of the populations are + # significantly different." 
if np.isnan(p_value): n.append(0) else: @@ -182,7 +183,6 @@ def _fit(self, X, y): X_c = X_[y == c] cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) - # set if the selection of instances is with replacement (if false it is not posible to select the same intance more than one) choosen = self._random_state.permutation(X_c.shape[0])[:cnt] @@ -191,78 +191,113 @@ def _fit(self, X, y): for rep, idx in enumerate(choosen): self._cand_length_list[c+","+str(idx)+","+str(rep)] = [] non_zero_acf = [] - if (self.len_method == "both" or self.len_method == "ACF" or self.len_method == "Max ACF") : + if (self.len_method == "both" or + self.len_method == "ACF" or + self.len_method == "Max ACF"): # 2.1-- Compute Autorrelation per object - acf_val, acf_confint = acf(X_c[idx], nlags=len(X_c[idx])-1, alpha=.05) + acf_val, acf_confint = acf(X_c[idx], + nlags=len(X_c[idx])-1, alpha=.05) prev_acf=0 for j, conf in enumerate(acf_confint): - if(3<=j and (0 < acf_confint[j][0] <= acf_confint[j][1] or acf_confint[j][0] <= acf_confint[j][1] < 0) ): + if(3<=j and (0 < acf_confint[j][0] <= acf_confint[j][1] or + acf_confint[j][0] <= acf_confint[j][1] < 0) ): # Consider just the maximum ACF value if prev_acf!=0 and self.len_method == "Max ACF": non_zero_acf.remove(prev_acf) - self._cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_acf) + self._cand_length_list[ + c+","+str(idx)+","+str(rep) + ].remove(prev_acf) non_zero_acf.append(j) - self._cand_length_list[c+","+str(idx)+","+str(rep)].append(j) + self._cand_length_list[ + c+","+str(idx)+","+str(rep) + ].append(j) prev_acf=j non_zero_pacf = [] - if (self.len_method == "both" or self.len_method == "PACF" or self.len_method == "Max PACF"): + if (self.len_method == "both" or + self.len_method == "PACF" or self.len_method == "Max PACF"): # 2.2 Compute Partial Autorrelation per object - pacf_val, pacf_confint = pacf(X_c[idx], method = "ols", nlags=(len(X_c[idx])//2) - 1, alpha = .05) + pacf_val, pacf_confint = pacf(X_c[idx], 
method = "ols", + nlags=(len(X_c[idx])//2) - 1, + alpha = .05) prev_pacf = 0 for j, conf in enumerate(pacf_confint): - if(3<=j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or pacf_confint[j][0] <= pacf_confint[j][1] < 0) ): + if(3<=j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or + pacf_confint[j][0] <= pacf_confint[j][1] < 0) ): # Consider just the maximum PACF value if prev_pacf!=0 and self.len_method == "Max PACF": non_zero_pacf.remove(prev_pacf) - self._cand_length_list[c+","+str(idx)+","+str(rep)].remove(prev_pacf) + self._cand_length_list[ + c+","+str(idx)+","+str(rep) + ].remove(prev_pacf) non_zero_pacf.append(j) - self._cand_length_list[c+","+str(idx)+","+str(rep)].append(j) + self._cand_length_list[ + c+","+str(idx)+","+str(rep) + ].append(j) prev_pacf=j if (self.len_method == "all"): - self._cand_length_list[c+","+str(idx)+","+str(rep)].extend(np.arange(3, 1+ len(X_c[idx]))) + self._cand_length_list[ + c+","+str(idx)+","+str(rep) + ].extend(np.arange(3, 1+ len(X_c[idx]))) - # 2.3-- Save the maximum autocorralated lag value as shapelet lenght + # 2.3-- Save the maximum autocorralated lag value as shapelet lenght if len(self._cand_length_list[c+","+str(idx)+","+str(rep)]) == 0: - # chose a random lenght using the lenght of the time series (added 1 since the range start in 0) + # chose a random lenght using the lenght of the time series + # (added 1 since the range start in 0) rand_value = self._random_state.choice(len(X_c[idx]), 1)[0]+1 - self._cand_length_list[c+","+str(idx)+","+str(rep)].extend([max(3,rand_value)]) + self._cand_length_list[ + c+","+str(idx)+","+str(rep) + ].extend([max(3, rand_value)]) - self._cand_length_list[c+","+str(idx)+","+str(rep)] = list(set(self._cand_length_list[c+","+str(idx)+","+str(rep)])) + self._cand_length_list[ + c+","+str(idx)+","+str(rep) + ] = list(set(self._cand_length_list[c+","+str(idx)+","+str(rep)])) - for max_shp_length in self._cand_length_list[c+","+str(idx)+","+str(rep)]: + for max_shp_length in 
self._cand_length_list[ + c+","+str(idx)+","+str(rep) + ]: # 2.4-- Choose randomly n_random_points point for a TS - # 2.5-- calculate the weights of probabilities for a random point in a TS + # 2.5-- calculate the weights of probabilities for a random point + # in a TS if sum(n) == 0 : - # Determine equal weights of a random point point in TS is there are no significant points + # Determine equal weights of a random point point in TS is + # there are no significant points weights = [1/len(n) for i in range(len(n))] - weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum(weights[:len(X_c[idx])-max_shp_length+1]) + weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum( + weights[:len(X_c[idx])-max_shp_length+1]) else: - # Determine the weights of a random point point in TS (excluding points after n-l+1) + # Determine the weights of a random point point in TS + # (excluding points after n-l+1) weights = n / np.sum(n) - weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum(weights[:len(X_c[idx])-max_shp_length+1]) + weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum( + weights[:len(X_c[idx])-max_shp_length+1]) if self.n_random_points > len(X_c[idx])-max_shp_length+1 : - # set a upper limit for the posible of number of random points when selecting without replacement + # set a upper limit for the posible of number of random + # points when selecting without replacement limit_rpoint = len(X_c[idx])-max_shp_length+1 - rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, limit_rpoint, p = weights, replace = False) + rand_point_ts = self._random_state.choice( + len(X_c[idx])-max_shp_length+1, limit_rpoint, + p = weights, replace = False) else: - rand_point_ts = self._random_state.choice(len(X_c[idx])-max_shp_length+1, self.n_random_points, p = weights, replace = False) + rand_point_ts = self._random_state.choice( + len(X_c[idx])-max_shp_length+1, self.n_random_points, + p = weights, replace = False) for i in rand_point_ts: # 2.6-- 
Extract the subsequence with that point - kernel = X_c[idx][i:i+max_shp_length].reshape(1,-1).copy() + kernel = X_c[idx][i:i+max_shp_length].reshape(1, -1).copy() if m_kernel < max_shp_length: m_kernel = max_shp_length self._kernel_orig.append(np.squeeze(kernel)) - self._kernels_generators[c].extend(X_c[idx].reshape(1,-1)) + self._kernels_generators[c].extend(X_c[idx].reshape(1, -1)) # 3--save the calculated subsequences n_kernels = len (self._kernel_orig) From d9dfda537fd8c61f21508ffc8c75dcd4287cb467 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 14:16:17 +0200 Subject: [PATCH 12/38] updated identation --- .../collection/shapelet_based/_rsast.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index dee23146b7..abc766a1c7 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -103,8 +103,7 @@ def __init__( len_method="both", nb_inst_per_class=10, seed=None, - n_jobs=-1, - ): + n_jobs=-1,): self.n_random_points = n_random_points self.len_method = len_method self.nb_inst_per_class = nb_inst_per_class @@ -140,8 +139,7 @@ def _fit(self, X, y): self._random_state = ( np.random.RandomState(self.seed) if not isinstance(self.seed, np.random.RandomState) - else self.seed - ) + else self.seed) classes = np.unique(y) self._num_classes = classes.shape[0] @@ -158,8 +156,9 @@ def _fit(self, X, y): statistic_per_class = {} for c in classes: assert len( - X_[np.where(y == c)[0]][:, i] - ) > 0, 'Time t without values in TS' + X_[ + np.where(y == c)[0] + ][:, i]) > 0, 'Time t without values in TS' statistic_per_class[c] = X_[np.where(y == c)[0]][:, i] statistic_per_class = pd.Series(statistic_per_class) @@ -191,19 +190,20 @@ def _fit(self, X, y): for rep, idx in enumerate(choosen): self._cand_length_list[c+","+str(idx)+","+str(rep)] = 
[] non_zero_acf = [] - if (self.len_method == "both" or - self.len_method == "ACF" or + + if (self.len_method == "both" or self.len_method == "ACF" or self.len_method == "Max ACF"): - # 2.1-- Compute Autorrelation per object + + # 2.1 -- Compute Autorrelation per object acf_val, acf_confint = acf(X_c[idx], - nlags=len(X_c[idx])-1, alpha=.05) - prev_acf=0 - for j, conf in enumerate(acf_confint): + nlags=len(X_c[idx])-1, alpha=.05) + prev_acf = 0 + for j in range(len(acf_confint)): - if(3<=j and (0 < acf_confint[j][0] <= acf_confint[j][1] or - acf_confint[j][0] <= acf_confint[j][1] < 0) ): + if(3 <= j and (0 < acf_confint[j][0] <= acf_confint[j][1] or + acf_confint[j][0] <= acf_confint[j][1] < 0)): # Consider just the maximum ACF value - if prev_acf!=0 and self.len_method == "Max ACF": + if prev_acf != 0 and self.len_method == "Max ACF": non_zero_acf.remove(prev_acf) self._cand_length_list[ c+","+str(idx)+","+str(rep) @@ -212,8 +212,8 @@ def _fit(self, X, y): self._cand_length_list[ c+","+str(idx)+","+str(rep) ].append(j) - prev_acf=j - + prev_acf = j + non_zero_pacf = [] if (self.len_method == "both" or self.len_method == "PACF" or self.len_method == "Max PACF"): @@ -222,12 +222,12 @@ def _fit(self, X, y): nlags=(len(X_c[idx])//2) - 1, alpha = .05) prev_pacf = 0 - for j, conf in enumerate(pacf_confint): + for j in range(len(pacf_confint)): if(3<=j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or - pacf_confint[j][0] <= pacf_confint[j][1] < 0) ): + pacf_confint[j][0] <= pacf_confint[j][1] < 0)): # Consider just the maximum PACF value - if prev_pacf!=0 and self.len_method == "Max PACF": + if prev_pacf != 0 and self.len_method == "Max PACF": non_zero_pacf.remove(prev_pacf) self._cand_length_list[ c+","+str(idx)+","+str(rep) @@ -282,12 +282,12 @@ def _fit(self, X, y): limit_rpoint = len(X_c[idx])-max_shp_length+1 rand_point_ts = self._random_state.choice( len(X_c[idx])-max_shp_length+1, limit_rpoint, - p = weights, replace = False) + p=weights, replace=False) else: 
rand_point_ts = self._random_state.choice( len(X_c[idx])-max_shp_length+1, self.n_random_points, - p = weights, replace = False) + p=weights, replace=False) for i in rand_point_ts: # 2.6-- Extract the subsequence with that point @@ -300,17 +300,17 @@ def _fit(self, X, y): self._kernels_generators[c].extend(X_c[idx].reshape(1, -1)) # 3--save the calculated subsequences - n_kernels = len (self._kernel_orig) + n_kernels = len(self._kernel_orig) self._kernels = np.full( - (n_kernels, m_kernel), dtype = np.float32, fill_value = np.nan) + (n_kernels, m_kernel), dtype=np.float32, fill_value=np.nan) for k, kernel in enumerate(self._kernel_orig): self._kernels[k, :len(kernel)] = z_normalise_series(kernel) return self - def _transform(self, X, y = None): + def _transform(self, X, y=None): """Transform the input X using the generated subsequences. Parameters ---------- From 0c21b5f366ba02675dfd18b94a047cf18b0a7b43 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 14:45:58 +0200 Subject: [PATCH 13/38] corrected identation --- .../collection/shapelet_based/_rsast.py | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index abc766a1c7..75b7a31a1f 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -1,15 +1,12 @@ import numpy as np from numba import get_num_threads, njit, prange, set_num_threads - from aeon.transformations.collection import BaseCollectionTransformer from aeon.utils.numba.general import z_normalise_series from aeon.utils.validation import check_n_jobs - from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning from statsmodels.tsa.stattools import acf, pacf import pandas as pd - @njit(fastmath=False) def _apply_kernel(ts, arr): d_best = np.inf # sdist @@ -19,12 +16,10 @@ def _apply_kernel(ts, arr): 
kernel_len = kernel.shape[0] for i in range(m - kernel_len + 1): d = np.sum((z_normalise_series(ts[i : i + kernel_len]) - kernel) ** 2) - if d < d_best: + if d < d_best: d_best = d - return d_best - @njit(parallel=True, fastmath=True) def _apply_kernels(X, kernels): nbk = len(kernels) @@ -103,7 +98,8 @@ def __init__( len_method="both", nb_inst_per_class=10, seed=None, - n_jobs=-1,): + n_jobs=-1, + ): self.n_random_points = n_random_points self.len_method = len_method self.nb_inst_per_class = nb_inst_per_class @@ -113,7 +109,6 @@ def __init__( self._cand_length_list = {} self._kernel_orig = [] self._kernels_generators = {} # Reference time series - super().__init__() def _fit(self, X, y): @@ -191,10 +186,11 @@ def _fit(self, X, y): self._cand_length_list[c+","+str(idx)+","+str(rep)] = [] non_zero_acf = [] - if (self.len_method == "both" or self.len_method == "ACF" or - self.len_method == "Max ACF"): - - # 2.1 -- Compute Autorrelation per object + if ( + self.len_method == "both" or + self.len_method == "ACF" or self.len_method == "Max ACF" + ): + # 2.1 -- Compute Autorrelation per object acf_val, acf_confint = acf(X_c[idx], nlags=len(X_c[idx])-1, alpha=.05) prev_acf = 0 @@ -218,12 +214,11 @@ def _fit(self, X, y): if (self.len_method == "both" or self.len_method == "PACF" or self.len_method == "Max PACF"): # 2.2 Compute Partial Autorrelation per object - pacf_val, pacf_confint = pacf(X_c[idx], method = "ols", + pacf_val, pacf_confint = pacf(X_c[idx], method="ols", nlags=(len(X_c[idx])//2) - 1, - alpha = .05) + alpha=.05) prev_pacf = 0 for j in range(len(pacf_confint)): - if(3<=j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or pacf_confint[j][0] <= pacf_confint[j][1] < 0)): # Consider just the maximum PACF value @@ -263,7 +258,7 @@ def _fit(self, X, y): # 2.4-- Choose randomly n_random_points point for a TS # 2.5-- calculate the weights of probabilities for a random point # in a TS - if sum(n) == 0 : + if sum(n) == 0: # Determine equal weights of a random point 
point in TS is # there are no significant points weights = [1/len(n) for i in range(len(n))] @@ -276,7 +271,7 @@ def _fit(self, X, y): weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum( weights[:len(X_c[idx])-max_shp_length+1]) - if self.n_random_points > len(X_c[idx])-max_shp_length+1 : + if self.n_random_points > len(X_c[idx])-max_shp_length+1: # set a upper limit for the posible of number of random # points when selecting without replacement limit_rpoint = len(X_c[idx])-max_shp_length+1 From 54dafa84a3f30510d2d38a1467323af54ca016f4 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 15:04:19 +0200 Subject: [PATCH 14/38] updated identation --- .../collection/shapelet_based/_rsast.py | 56 +++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 75b7a31a1f..c601a4dae7 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -20,6 +20,7 @@ def _apply_kernel(ts, arr): d_best = d return d_best + @njit(parallel=True, fastmath=True) def _apply_kernels(X, kernels): nbk = len(kernels) @@ -183,39 +184,37 @@ def _fit(self, X, y): self._kernels_generators[c] = [] for rep, idx in enumerate(choosen): - self._cand_length_list[c+","+str(idx)+","+str(rep)] = [] + self._cand_length_list[c + "," + str(idx) + "," + str(rep)] = [] non_zero_acf = [] - if ( - self.len_method == "both" or - self.len_method == "ACF" or self.len_method == "Max ACF" - ): + if (self.len_method == "both" or + self.len_method == "ACF" or self.len_method == "Max ACF"): # 2.1 -- Compute Autorrelation per object acf_val, acf_confint = acf(X_c[idx], nlags=len(X_c[idx])-1, alpha=.05) prev_acf = 0 for j in range(len(acf_confint)): - if(3 <= j and (0 < acf_confint[j][0] <= acf_confint[j][1] or acf_confint[j][0] <= acf_confint[j][1] < 0)): # Consider just the maximum ACF 
value if prev_acf != 0 and self.len_method == "Max ACF": non_zero_acf.remove(prev_acf) self._cand_length_list[ - c+","+str(idx)+","+str(rep) + c + "," + str(idx) + "," + str(rep) ].remove(prev_acf) non_zero_acf.append(j) self._cand_length_list[ - c+","+str(idx)+","+str(rep) + c + "," + str(idx) + "," + str(rep) ].append(j) prev_acf = j non_zero_pacf = [] + if (self.len_method == "both" or self.len_method == "PACF" or self.len_method == "Max PACF"): # 2.2 Compute Partial Autorrelation per object pacf_val, pacf_confint = pacf(X_c[idx], method="ols", - nlags=(len(X_c[idx])//2) - 1, + nlags=(len(X_c[idx]) // 2) - 1, alpha=.05) prev_pacf = 0 for j in range(len(pacf_confint)): @@ -225,35 +224,35 @@ def _fit(self, X, y): if prev_pacf != 0 and self.len_method == "Max PACF": non_zero_pacf.remove(prev_pacf) self._cand_length_list[ - c+","+str(idx)+","+str(rep) + c + "," + str(idx)+"," + str(rep) ].remove(prev_pacf) non_zero_pacf.append(j) self._cand_length_list[ - c+","+str(idx)+","+str(rep) + c + "," + str(idx) + "," + str(rep) ].append(j) - prev_pacf=j + prev_pacf = j if (self.len_method == "all"): self._cand_length_list[ - c+","+str(idx)+","+str(rep) - ].extend(np.arange(3, 1+ len(X_c[idx]))) + c + ","+str(idx) + "," + str(rep) + ].extend(np.arange(3, 1 + len(X_c[idx]))) # 2.3-- Save the maximum autocorralated lag value as shapelet lenght - if len(self._cand_length_list[c+","+str(idx)+","+str(rep)]) == 0: + if len(self._cand_length_list[c + "," + str(idx) + "," + str(rep)]) == 0: # chose a random lenght using the lenght of the time series # (added 1 since the range start in 0) - rand_value = self._random_state.choice(len(X_c[idx]), 1)[0]+1 + rand_value = self._random_state.choice(len(X_c[idx]), 1)[0] + 1 self._cand_length_list[ - c+","+str(idx)+","+str(rep) + c + "," + str(idx) + "," + str(rep) ].extend([max(3, rand_value)]) self._cand_length_list[ - c+","+str(idx)+","+str(rep) - ] = list(set(self._cand_length_list[c+","+str(idx)+","+str(rep)])) + c + "," + str(idx) + "," 
+ str(rep) + ] = list(set(self._cand_length_list[c + "," + str(idx) + "," + str(rep)])) for max_shp_length in self._cand_length_list[ - c+","+str(idx)+","+str(rep) + c + ","+str(idx) + "," + str(rep) ]: # 2.4-- Choose randomly n_random_points point for a TS # 2.5-- calculate the weights of probabilities for a random point @@ -262,31 +261,30 @@ def _fit(self, X, y): # Determine equal weights of a random point point in TS is # there are no significant points weights = [1/len(n) for i in range(len(n))] - weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum( - weights[:len(X_c[idx])-max_shp_length+1]) + weights = weights[:len(X_c[idx]) - max_shp_length + 1]/np.sum( + weights[:len(X_c[idx]) - max_shp_length + 1]) else: # Determine the weights of a random point point in TS # (excluding points after n-l+1) weights = n / np.sum(n) - weights = weights[:len(X_c[idx])-max_shp_length +1]/np.sum( - weights[:len(X_c[idx])-max_shp_length+1]) + weights = weights[:len(X_c[idx]) - max_shp_length + 1]/np.sum( + weights[:len(X_c[idx]) - max_shp_length + 1]) if self.n_random_points > len(X_c[idx])-max_shp_length+1: # set a upper limit for the posible of number of random # points when selecting without replacement - limit_rpoint = len(X_c[idx])-max_shp_length+1 + limit_rpoint = len(X_c[idx]) - max_shp_length + 1 rand_point_ts = self._random_state.choice( - len(X_c[idx])-max_shp_length+1, limit_rpoint, + len(X_c[idx]) - max_shp_length + 1, limit_rpoint, p=weights, replace=False) - else: rand_point_ts = self._random_state.choice( - len(X_c[idx])-max_shp_length+1, self.n_random_points, + len(X_c[idx]) - max_shp_length + 1, self.n_random_points, p=weights, replace=False) for i in rand_point_ts: # 2.6-- Extract the subsequence with that point - kernel = X_c[idx][i:i+max_shp_length].reshape(1, -1).copy() + kernel = X_c[idx][i : i + max_shp_length].reshape(1, -1).copy() if m_kernel < max_shp_length: m_kernel = max_shp_length From a1d1ecea972522446e25c0a2e5f95f51ff063a71 Mon Sep 17 
00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 15:12:49 +0200 Subject: [PATCH 15/38] updated identation --- .../collection/shapelet_based/_rsast.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index c601a4dae7..6b34d50654 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -187,14 +187,17 @@ def _fit(self, X, y): self._cand_length_list[c + "," + str(idx) + "," + str(rep)] = [] non_zero_acf = [] - if (self.len_method == "both" or - self.len_method == "ACF" or self.len_method == "Max ACF"): + if ( + self.len_method == "both" or + self.len_method == "ACF" or + self.len_method == "Max ACF" + ): # 2.1 -- Compute Autorrelation per object acf_val, acf_confint = acf(X_c[idx], nlags=len(X_c[idx])-1, alpha=.05) prev_acf = 0 for j in range(len(acf_confint)): - if(3 <= j and (0 < acf_confint[j][0] <= acf_confint[j][1] or + if (3 <= j and (0 < acf_confint[j][0] <= acf_confint[j][1] or acf_confint[j][0] <= acf_confint[j][1] < 0)): # Consider just the maximum ACF value if prev_acf != 0 and self.len_method == "Max ACF": @@ -211,14 +214,16 @@ def _fit(self, X, y): non_zero_pacf = [] if (self.len_method == "both" or - self.len_method == "PACF" or self.len_method == "Max PACF"): + self.len_method == "PACF" or + self.len_method == "Max PACF" + ): # 2.2 Compute Partial Autorrelation per object pacf_val, pacf_confint = pacf(X_c[idx], method="ols", nlags=(len(X_c[idx]) // 2) - 1, alpha=.05) prev_pacf = 0 for j in range(len(pacf_confint)): - if(3<=j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or + if (3 <= j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or pacf_confint[j][0] <= pacf_confint[j][1] < 0)): # Consider just the maximum PACF value if prev_pacf != 0 and self.len_method == "Max PACF": @@ -239,7 +244,8 @@ def _fit(self, X, y): 
].extend(np.arange(3, 1 + len(X_c[idx]))) # 2.3-- Save the maximum autocorralated lag value as shapelet lenght - if len(self._cand_length_list[c + "," + str(idx) + "," + str(rep)]) == 0: + if len(self._cand_length_list[ + c + "," + str(idx) + "," + str(rep)]) == 0: # chose a random lenght using the lenght of the time series # (added 1 since the range start in 0) rand_value = self._random_state.choice(len(X_c[idx]), 1)[0] + 1 @@ -249,7 +255,8 @@ def _fit(self, X, y): self._cand_length_list[ c + "," + str(idx) + "," + str(rep) - ] = list(set(self._cand_length_list[c + "," + str(idx) + "," + str(rep)])) + ] = list(set(self._cand_length_list[ + c + "," + str(idx) + "," + str(rep)])) for max_shp_length in self._cand_length_list[ c + ","+str(idx) + "," + str(rep) From 7bc3df10a07ea07189bb2d60958ac984baacf52a Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 15:19:25 +0200 Subject: [PATCH 16/38] updated identation --- .../collection/shapelet_based/_rsast.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 6b34d50654..498b19d35c 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -197,8 +197,10 @@ def _fit(self, X, y): nlags=len(X_c[idx])-1, alpha=.05) prev_acf = 0 for j in range(len(acf_confint)): - if (3 <= j and (0 < acf_confint[j][0] <= acf_confint[j][1] or - acf_confint[j][0] <= acf_confint[j][1] < 0)): + if ( + 3 <= j and + (0 < acf_confint[j][0] <= acf_confint[j][1] or + acf_confint[j][0] <= acf_confint[j][1] < 0)): # Consider just the maximum ACF value if prev_acf != 0 and self.len_method == "Max ACF": non_zero_acf.remove(prev_acf) @@ -215,16 +217,17 @@ def _fit(self, X, y): if (self.len_method == "both" or self.len_method == "PACF" or - self.len_method == "Max PACF" - ): + self.len_method == "Max PACF"): # 2.2 
Compute Partial Autorrelation per object pacf_val, pacf_confint = pacf(X_c[idx], method="ols", nlags=(len(X_c[idx]) // 2) - 1, alpha=.05) prev_pacf = 0 for j in range(len(pacf_confint)): - if (3 <= j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or - pacf_confint[j][0] <= pacf_confint[j][1] < 0)): + if ( + 3 <= j and + (0 < pacf_confint[j][0] <= pacf_confint[j][1] or + pacf_confint[j][0] <= pacf_confint[j][1] < 0)): # Consider just the maximum PACF value if prev_pacf != 0 and self.len_method == "Max PACF": non_zero_pacf.remove(prev_pacf) @@ -244,8 +247,7 @@ def _fit(self, X, y): ].extend(np.arange(3, 1 + len(X_c[idx]))) # 2.3-- Save the maximum autocorralated lag value as shapelet lenght - if len(self._cand_length_list[ - c + "," + str(idx) + "," + str(rep)]) == 0: + if len(self._cand_length_list[c + "," + str(idx) + "," + str(rep)]) == 0: # chose a random lenght using the lenght of the time series # (added 1 since the range start in 0) rand_value = self._random_state.choice(len(X_c[idx]), 1)[0] + 1 From f86c4654486666b596d428c3e87e126bd87fd805 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 15:24:59 +0200 Subject: [PATCH 17/38] excluded max acf and max pacf --- .../collection/shapelet_based/_rsast.py | 56 +++++++------------ 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 498b19d35c..b0d5c57feb 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -189,57 +189,43 @@ def _fit(self, X, y): if ( self.len_method == "both" or - self.len_method == "ACF" or - self.len_method == "Max ACF" - ): + self.len_method == "ACF"): # 2.1 -- Compute Autorrelation per object acf_val, acf_confint = acf(X_c[idx], nlags=len(X_c[idx])-1, alpha=.05) - prev_acf = 0 + for j in range(len(acf_confint)): if ( 3 <= j and (0 < acf_confint[j][0] <= 
acf_confint[j][1] or acf_confint[j][0] <= acf_confint[j][1] < 0)): - # Consider just the maximum ACF value - if prev_acf != 0 and self.len_method == "Max ACF": - non_zero_acf.remove(prev_acf) - self._cand_length_list[ - c + "," + str(idx) + "," + str(rep) - ].remove(prev_acf) + non_zero_acf.append(j) self._cand_length_list[ c + "," + str(idx) + "," + str(rep) ].append(j) - prev_acf = j + non_zero_pacf = [] if (self.len_method == "both" or - self.len_method == "PACF" or - self.len_method == "Max PACF"): - # 2.2 Compute Partial Autorrelation per object - pacf_val, pacf_confint = pacf(X_c[idx], method="ols", - nlags=(len(X_c[idx]) // 2) - 1, - alpha=.05) - prev_pacf = 0 - for j in range(len(pacf_confint)): - if ( - 3 <= j and - (0 < pacf_confint[j][0] <= pacf_confint[j][1] or - pacf_confint[j][0] <= pacf_confint[j][1] < 0)): - # Consider just the maximum PACF value - if prev_pacf != 0 and self.len_method == "Max PACF": - non_zero_pacf.remove(prev_pacf) - self._cand_length_list[ - c + "," + str(idx)+"," + str(rep) - ].remove(prev_pacf) - - non_zero_pacf.append(j) - self._cand_length_list[ - c + "," + str(idx) + "," + str(rep) - ].append(j) - prev_pacf = j + self.len_method == "PACF"): + # 2.2 Compute Partial Autorrelation per object + pacf_val, pacf_confint = pacf(X_c[idx], method="ols", + nlags=(len(X_c[idx]) // 2) - 1, + alpha=.05) + + for j in range(len(pacf_confint)): + if ( + 3 <= j and + (0 < pacf_confint[j][0] <= pacf_confint[j][1] or + pacf_confint[j][0] <= pacf_confint[j][1] < 0)): + + non_zero_pacf.append(j) + self._cand_length_list[ + c + "," + str(idx) + "," + str(rep) + ].append(j) + if (self.len_method == "all"): self._cand_length_list[ From 2d278ec9ce49b0976eaf3656730cc7d598474c0f Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 15:28:58 +0200 Subject: [PATCH 18/38] updated identation --- .../collection/shapelet_based/_rsast.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git 
a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index b0d5c57feb..f16232be98 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -185,29 +185,27 @@ def _fit(self, X, y): for rep, idx in enumerate(choosen): self._cand_length_list[c + "," + str(idx) + "," + str(rep)] = [] + non_zero_acf = [] - if ( self.len_method == "both" or self.len_method == "ACF"): - # 2.1 -- Compute Autorrelation per object - acf_val, acf_confint = acf(X_c[idx], - nlags=len(X_c[idx])-1, alpha=.05) - - for j in range(len(acf_confint)): - if ( - 3 <= j and - (0 < acf_confint[j][0] <= acf_confint[j][1] or - acf_confint[j][0] <= acf_confint[j][1] < 0)): - - non_zero_acf.append(j) - self._cand_length_list[ - c + "," + str(idx) + "," + str(rep) - ].append(j) + # 2.1 -- Compute Autorrelation per object + acf_val, acf_confint = acf(X_c[idx], + nlags=len(X_c[idx])-1, alpha=.05) + + for j in range(len(acf_confint)): + if ( + 3 <= j and + (0 < acf_confint[j][0] <= acf_confint[j][1] or + acf_confint[j][0] <= acf_confint[j][1] < 0)): + non_zero_acf.append(j) + self._cand_length_list[ + c + "," + str(idx) + "," + str(rep) + ].append(j) non_zero_pacf = [] - if (self.len_method == "both" or self.len_method == "PACF"): # 2.2 Compute Partial Autorrelation per object @@ -220,7 +218,7 @@ def _fit(self, X, y): 3 <= j and (0 < pacf_confint[j][0] <= pacf_confint[j][1] or pacf_confint[j][0] <= pacf_confint[j][1] < 0)): - + non_zero_pacf.append(j) self._cand_length_list[ c + "," + str(idx) + "," + str(rep) From 246b0ad7abc7f75d7f49452c7e01d9f2980e71a0 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 15:47:54 +0200 Subject: [PATCH 19/38] updated identation --- .../collection/shapelet_based/_rsast.py | 62 ++++++++----------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py 
b/aeon/transformations/collection/shapelet_based/_rsast.py index f16232be98..b877e5432f 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -187,44 +187,36 @@ def _fit(self, X, y): self._cand_length_list[c + "," + str(idx) + "," + str(rep)] = [] non_zero_acf = [] - if ( - self.len_method == "both" or - self.len_method == "ACF"): - # 2.1 -- Compute Autorrelation per object - acf_val, acf_confint = acf(X_c[idx], - nlags=len(X_c[idx])-1, alpha=.05) - - for j in range(len(acf_confint)): - if ( - 3 <= j and - (0 < acf_confint[j][0] <= acf_confint[j][1] or - acf_confint[j][0] <= acf_confint[j][1] < 0)): - - non_zero_acf.append(j) - self._cand_length_list[ - c + "," + str(idx) + "," + str(rep) - ].append(j) + if (self.len_method == "both" or self.len_method == "ACF"): + # 2.1 -- Compute Autorrelation per object + acf_val, acf_confint = acf(X_c[idx], + nlags=len(X_c[idx])-1, alpha=.05) + + for j in range(len(acf_confint)): + if (3 <= j and + (0 < acf_confint[j][0] <= acf_confint[j][1] or + acf_confint[j][0] <= acf_confint[j][1] < 0)): + non_zero_acf.append(j) + self._cand_length_list[ + c + "," + str(idx) + "," + str(rep) + ].append(j) non_zero_pacf = [] - if (self.len_method == "both" or - self.len_method == "PACF"): - # 2.2 Compute Partial Autorrelation per object - pacf_val, pacf_confint = pacf(X_c[idx], method="ols", - nlags=(len(X_c[idx]) // 2) - 1, - alpha=.05) - - for j in range(len(pacf_confint)): - if ( - 3 <= j and - (0 < pacf_confint[j][0] <= pacf_confint[j][1] or - pacf_confint[j][0] <= pacf_confint[j][1] < 0)): - - non_zero_pacf.append(j) - self._cand_length_list[ - c + "," + str(idx) + "," + str(rep) - ].append(j) + if (self.len_method == "both" or self.len_method == "PACF"): + # 2.2 Compute Partial Autorrelation per object + pacf_val, pacf_confint = pacf(X_c[idx], method="ols", + nlags=(len(X_c[idx]) // 2) - 1, + alpha=.05) + + for j in range(len(pacf_confint)): + if (3 <= j 
and + (0 < pacf_confint[j][0] <= pacf_confint[j][1] or + pacf_confint[j][0] <= pacf_confint[j][1] < 0)): + non_zero_pacf.append(j) + self._cand_length_list[ + c + "," + str(idx) + "," + str(rep) + ].append(j) - if (self.len_method == "all"): self._cand_length_list[ c + ","+str(idx) + "," + str(rep) From ca59b2c71b722b5f6f6b8448d91df4efea5e9ad1 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 15:56:07 +0200 Subject: [PATCH 20/38] updated identation --- .../collection/shapelet_based/_rsast.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index b877e5432f..dbad1e5561 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -189,8 +189,9 @@ def _fit(self, X, y): non_zero_acf = [] if (self.len_method == "both" or self.len_method == "ACF"): # 2.1 -- Compute Autorrelation per object - acf_val, acf_confint = acf(X_c[idx], - nlags=len(X_c[idx])-1, alpha=.05) + acf_val, acf_confint = acf( + X_c[idx], nlags=len(X_c[idx]) - 1, + alpha=.05) for j in range(len(acf_confint)): if (3 <= j and @@ -204,9 +205,9 @@ def _fit(self, X, y): non_zero_pacf = [] if (self.len_method == "both" or self.len_method == "PACF"): # 2.2 Compute Partial Autorrelation per object - pacf_val, pacf_confint = pacf(X_c[idx], method="ols", - nlags=(len(X_c[idx]) // 2) - 1, - alpha=.05) + pacf_val, pacf_confint = pacf( + X_c[idx], method="ols", nlags=(len(X_c[idx]) // 2) - 1, + alpha=.05) for j in range(len(pacf_confint)): if (3 <= j and @@ -237,8 +238,7 @@ def _fit(self, X, y): c + "," + str(idx) + "," + str(rep)])) for max_shp_length in self._cand_length_list[ - c + ","+str(idx) + "," + str(rep) - ]: + c + ","+str(idx) + "," + str(rep)]: # 2.4-- Choose randomly n_random_points point for a TS # 2.5-- calculate the weights of probabilities for a random point # in a TS 
From 52a1d3372319b49982a79d4fd9eccf001b99b19a Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 16:06:27 +0200 Subject: [PATCH 21/38] updated identation --- .../collection/shapelet_based/_rsast.py | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index dbad1e5561..2957af0699 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -175,6 +175,8 @@ def _fit(self, X, y): # 2--calculate PACF and ACF for each TS chossen in each class for i, c in enumerate(classes): + + idx_len_list = c + ","+str(idx) + "," + str(rep) X_c = X_[y == c] cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) @@ -184,7 +186,7 @@ def _fit(self, X, y): self._kernels_generators[c] = [] for rep, idx in enumerate(choosen): - self._cand_length_list[c + "," + str(idx) + "," + str(rep)] = [] + self._cand_length_list[idx_len_list] = [] non_zero_acf = [] if (self.len_method == "both" or self.len_method == "ACF"): @@ -198,9 +200,7 @@ def _fit(self, X, y): (0 < acf_confint[j][0] <= acf_confint[j][1] or acf_confint[j][0] <= acf_confint[j][1] < 0)): non_zero_acf.append(j) - self._cand_length_list[ - c + "," + str(idx) + "," + str(rep) - ].append(j) + self._cand_length_list[idx_len_list].append(j) non_zero_pacf = [] if (self.len_method == "both" or self.len_method == "PACF"): @@ -214,31 +214,23 @@ def _fit(self, X, y): (0 < pacf_confint[j][0] <= pacf_confint[j][1] or pacf_confint[j][0] <= pacf_confint[j][1] < 0)): non_zero_pacf.append(j) - self._cand_length_list[ - c + "," + str(idx) + "," + str(rep) - ].append(j) + self._cand_length_list[idx_len_list].append(j) if (self.len_method == "all"): - self._cand_length_list[ - c + ","+str(idx) + "," + str(rep) - ].extend(np.arange(3, 1 + len(X_c[idx]))) + self._cand_length_list[idx_len_list].extend( + 
np.arange(3, 1 + len(X_c[idx]))) # 2.3-- Save the maximum autocorralated lag value as shapelet lenght - if len(self._cand_length_list[c + "," + str(idx) + "," + str(rep)]) == 0: + if len(self._cand_length_list[idx_len_list]) == 0: # chose a random lenght using the lenght of the time series # (added 1 since the range start in 0) rand_value = self._random_state.choice(len(X_c[idx]), 1)[0] + 1 - self._cand_length_list[ - c + "," + str(idx) + "," + str(rep) - ].extend([max(3, rand_value)]) + self._cand_length_list[idx_len_list].extend([max(3, rand_value)]) - self._cand_length_list[ - c + "," + str(idx) + "," + str(rep) - ] = list(set(self._cand_length_list[ - c + "," + str(idx) + "," + str(rep)])) + self._cand_length_list[idx_len_list] = list(set( + self._cand_length_list[idx_len_list])) - for max_shp_length in self._cand_length_list[ - c + ","+str(idx) + "," + str(rep)]: + for max_shp_length in self._cand_length_list[idx_len_list]: # 2.4-- Choose randomly n_random_points point for a TS # 2.5-- calculate the weights of probabilities for a random point # in a TS From 2f7e3b45a2330753a1cebb0107513549433a9b6c Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 16:13:44 +0200 Subject: [PATCH 22/38] updated identation --- aeon/transformations/collection/shapelet_based/_rsast.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 2957af0699..d676d89055 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -91,6 +91,7 @@ class RSAST(BaseCollectionTransformer): "output_data_type": "Tabular", "capability:multivariate": False, "algorithm_type": "subsequence", + "python_dependencies": "statsmodels", } def __init__( @@ -176,7 +177,6 @@ def _fit(self, X, y): for i, c in enumerate(classes): - idx_len_list = c + ","+str(idx) + "," + str(rep) X_c = X_[y == 
c] cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) @@ -186,6 +186,9 @@ def _fit(self, X, y): self._kernels_generators[c] = [] for rep, idx in enumerate(choosen): + + idx_len_list = c + ","+str(idx) + "," + str(rep) # defining indices for length list + self._cand_length_list[idx_len_list] = [] non_zero_acf = [] From 395ece6c4821e6970fbc83ab928f25c72b7b55f1 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 16:22:22 +0200 Subject: [PATCH 23/38] updated identation --- aeon/transformations/collection/shapelet_based/_rsast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index d676d89055..9e74a7ebf5 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -186,8 +186,8 @@ def _fit(self, X, y): self._kernels_generators[c] = [] for rep, idx in enumerate(choosen): - - idx_len_list = c + ","+str(idx) + "," + str(rep) # defining indices for length list + # defining indices for length list + idx_len_list = c + ","+str(idx) + "," + str(rep) self._cand_length_list[idx_len_list] = [] From 846472f719ee91f7695a2da7ed46d2147e107515 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 16:22:42 +0200 Subject: [PATCH 24/38] updated identation --- aeon/classification/shapelet_based/_rsast_classifier.py | 1 + aeon/transformations/collection/shapelet_based/_rsast.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 2bbaef9eae..925691c6ed 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -61,6 +61,7 @@ class RSASTClassifier(BaseClassifier): "capability:multithreading": True, "capability:multivariate": False, 
"algorithm_type": "subsequence", + "python_dependencies": ["statsmodels"], } def __init__( diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 9e74a7ebf5..ac47e9b0ba 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -91,7 +91,7 @@ class RSAST(BaseCollectionTransformer): "output_data_type": "Tabular", "capability:multivariate": False, "algorithm_type": "subsequence", - "python_dependencies": "statsmodels", + "python_dependencies": ["statsmodels"], } def __init__( From 013af532e7446b23f6568466c820a968d4fd7240 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Mon, 8 Apr 2024 16:27:33 +0200 Subject: [PATCH 25/38] update packages --- aeon/classification/shapelet_based/_rsast_classifier.py | 2 +- aeon/transformations/collection/shapelet_based/_rsast.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 925691c6ed..0aa35b7b7d 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -61,7 +61,7 @@ class RSASTClassifier(BaseClassifier): "capability:multithreading": True, "capability:multivariate": False, "algorithm_type": "subsequence", - "python_dependencies": ["statsmodels"], + "python_dependencies": "statsmodels", } def __init__( diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index ac47e9b0ba..8f2aea9b69 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -91,7 +91,6 @@ class RSAST(BaseCollectionTransformer): "output_data_type": "Tabular", "capability:multivariate": False, "algorithm_type": "subsequence", - "python_dependencies": 
["statsmodels"], } def __init__( From b7ad0a77e6397974ed0c01ad8c60e8d50a7b6caf Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sat, 13 Apr 2024 23:07:05 +0200 Subject: [PATCH 26/38] included tag in transformer --- aeon/classification/shapelet_based/_rsast_classifier.py | 4 ++-- aeon/transformations/collection/shapelet_based/_rsast.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 0aa35b7b7d..766d739e6a 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -17,7 +17,7 @@ from aeon.classification import BaseClassifier from aeon.transformations.collection.shapelet_based import RSAST from aeon.utils.numba.general import z_normalise_series -import matplotlib.pyplot as plt + class RSASTClassifier(BaseClassifier): @@ -159,7 +159,7 @@ def _predict_proba(self, X): return dists def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): - + import matplotlib.pyplot as plt """Plot the most important features on ts. 
Parameters diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 8f2aea9b69..42735d4086 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -3,8 +3,8 @@ from aeon.transformations.collection import BaseCollectionTransformer from aeon.utils.numba.general import z_normalise_series from aeon.utils.validation import check_n_jobs -from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning -from statsmodels.tsa.stattools import acf, pacf + + import pandas as pd @njit(fastmath=False) @@ -86,11 +86,14 @@ class RSAST(BaseCollectionTransformer): >>> X_test = rsast.transform(X_test) """ + from statsmodels.tsa.stattools import acf, pacf + from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning _tags = { "output_data_type": "Tabular", "capability:multivariate": False, "algorithm_type": "subsequence", + "python_dependencies": "statsmodels", } def __init__( From 1182a3ad6206ddcfe0c560be80e2ced77620e24e Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sat, 13 Apr 2024 23:11:58 +0200 Subject: [PATCH 27/38] moved the import libraries --- aeon/transformations/collection/shapelet_based/_rsast.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 42735d4086..66353daec8 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -86,9 +86,8 @@ class RSAST(BaseCollectionTransformer): >>> X_test = rsast.transform(X_test) """ - from statsmodels.tsa.stattools import acf, pacf - from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning - + + _tags = { "output_data_type": "Tabular", "capability:multivariate": False, @@ -96,6 +95,9 @@ class 
RSAST(BaseCollectionTransformer): "python_dependencies": "statsmodels", } + from statsmodels.tsa.stattools import acf, pacf + from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning + def __init__( self, n_random_points=10, From e1227fd1ea5db8a2e4ea08fdba9a4a49c3f66fe0 Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sat, 13 Apr 2024 23:23:33 +0200 Subject: [PATCH 28/38] included brackets --- aeon/classification/shapelet_based/_rsast_classifier.py | 2 +- aeon/transformations/collection/shapelet_based/_rsast.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 766d739e6a..f85b8b9024 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -61,7 +61,7 @@ class RSASTClassifier(BaseClassifier): "capability:multithreading": True, "capability:multivariate": False, "algorithm_type": "subsequence", - "python_dependencies": "statsmodels", + "python_dependencies": ["statsmodels"], } def __init__( diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 66353daec8..56c710c602 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -92,12 +92,12 @@ class RSAST(BaseCollectionTransformer): "output_data_type": "Tabular", "capability:multivariate": False, "algorithm_type": "subsequence", - "python_dependencies": "statsmodels", + "python_dependencies": ["statsmodels"], } from statsmodels.tsa.stattools import acf, pacf from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning - + def __init__( self, n_random_points=10, From 85bd62cf1e0ab2569de5e2386a05c3b33f0f2fce Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sat, 13 Apr 2024 23:27:46 +0200 Subject: 
[PATCH 29/38] moved libraries to fit function --- aeon/transformations/collection/shapelet_based/_rsast.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 56c710c602..08bdc652f5 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -95,8 +95,7 @@ class RSAST(BaseCollectionTransformer): "python_dependencies": ["statsmodels"], } - from statsmodels.tsa.stattools import acf, pacf - from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning + def __init__( self, @@ -118,6 +117,10 @@ def __init__( super().__init__() def _fit(self, X, y): + + from statsmodels.tsa.stattools import acf, pacf + from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning + """Select reference time series and generate subsequences from them. Parameters From 5996a92118e992a3e9f4c485fbec6c03eacc2f9d Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sat, 13 Apr 2024 23:31:58 +0200 Subject: [PATCH 30/38] deleted spaces --- aeon/transformations/collection/shapelet_based/_rsast.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 08bdc652f5..5ae9b9c104 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -86,8 +86,6 @@ class RSAST(BaseCollectionTransformer): >>> X_test = rsast.transform(X_test) """ - - _tags = { "output_data_type": "Tabular", "capability:multivariate": False, @@ -95,8 +93,6 @@ class RSAST(BaseCollectionTransformer): "python_dependencies": ["statsmodels"], } - - def __init__( self, n_random_points=10, From c5539f605e3a243b651f0523b27105e4e0973e62 Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sat, 13 
Apr 2024 23:43:51 +0200 Subject: [PATCH 31/38] updated spaces --- aeon/classification/shapelet_based/_rsast_classifier.py | 1 + aeon/classification/shapelet_based/_sast_classifier.py | 1 + aeon/transformations/collection/shapelet_based/_rsast.py | 5 ++--- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index f85b8b9024..08b89b75f5 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -199,3 +199,4 @@ def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): axes[f].plot(range(start_pos, start_pos + kernel.size), kernel, linewidth=5) axes[f].plot(range(ts.size), ts, linewidth=2) axes[f].set_title(f"feature: {f+1}") + \ No newline at end of file diff --git a/aeon/classification/shapelet_based/_sast_classifier.py b/aeon/classification/shapelet_based/_sast_classifier.py index a2bdca6fad..990366e79d 100644 --- a/aeon/classification/shapelet_based/_sast_classifier.py +++ b/aeon/classification/shapelet_based/_sast_classifier.py @@ -18,6 +18,7 @@ from aeon.utils.numba.general import z_normalise_series + class SASTClassifier(BaseClassifier): """Classification pipeline using SAST [1]_ transformer and an sklean classifier. 
diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 5ae9b9c104..43cfaa023f 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -3,10 +3,9 @@ from aeon.transformations.collection import BaseCollectionTransformer from aeon.utils.numba.general import z_normalise_series from aeon.utils.validation import check_n_jobs - - import pandas as pd + @njit(fastmath=False) def _apply_kernel(ts, arr): d_best = np.inf # sdist @@ -86,6 +85,7 @@ class RSAST(BaseCollectionTransformer): >>> X_test = rsast.transform(X_test) """ + _tags = { "output_data_type": "Tabular", "capability:multivariate": False, @@ -312,4 +312,3 @@ def _transform(self, X, y=None): set_num_threads(prev_threads) return X_transformed - From b525d88a9c0c6f7a6acd1cc9867842e1db0cb21f Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sat, 13 Apr 2024 23:49:44 +0200 Subject: [PATCH 32/38] updated identation --- aeon/classification/shapelet_based/_rsast_classifier.py | 1 - aeon/transformations/collection/shapelet_based/_rsast.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 08b89b75f5..f85b8b9024 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -199,4 +199,3 @@ def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): axes[f].plot(range(start_pos, start_pos + kernel.size), kernel, linewidth=5) axes[f].plot(range(ts.size), ts, linewidth=2) axes[f].set_title(f"feature: {f+1}") - \ No newline at end of file diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 43cfaa023f..e9e137c344 100644 --- 
a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -111,7 +111,7 @@ def __init__( self._kernel_orig = [] self._kernels_generators = {} # Reference time series super().__init__() - + def _fit(self, X, y): from statsmodels.tsa.stattools import acf, pacf @@ -285,7 +285,7 @@ def _fit(self, X, y): self._kernels[k, :len(kernel)] = z_normalise_series(kernel) return self - + def _transform(self, X, y=None): """Transform the input X using the generated subsequences. Parameters From fa80c266e3de01002a945d31b4cd51a1ca1af4f8 Mon Sep 17 00:00:00 2001 From: Nicolas Rojas Varela Date: Sun, 14 Apr 2024 11:43:30 +0200 Subject: [PATCH 33/38] included library statmodel in rsast classifier --- aeon/classification/shapelet_based/_rsast_classifier.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index f85b8b9024..0e7bf7122f 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -97,6 +97,9 @@ def _fit(self, X, y): This pipeline classifier """ + from statsmodels.tsa.stattools import acf, pacf + from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning + self._transformer = RSAST( self.n_random_points, self.len_method, From a7fe63c76f4a073205dac63c844b8722a5f58292 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Tue, 16 Apr 2024 15:22:33 +0200 Subject: [PATCH 34/38] applied in Classifier: doctest: +SKIP --- .../shapelet_based/_rsast_classifier.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 0e7bf7122f..f88aec0727 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -18,8 
+18,6 @@ from aeon.transformations.collection.shapelet_based import RSAST from aeon.utils.numba.general import z_normalise_series - - class RSASTClassifier(BaseClassifier): """Classification pipeline using RSAST [1]_ transformer and an sklean classifier. @@ -51,10 +49,10 @@ class RSASTClassifier(BaseClassifier): >>> from aeon.datasets import load_unit_test >>> X_train, y_train = load_unit_test(split="train") >>> X_test, y_test = load_unit_test(split="test") - >>> clf = RSASTClassifier() - >>> clf.fit(X_train, y_train) + >>> clf = RSASTClassifier() # doctest: +SKIP + >>> clf.fit(X_train, y_train) # doctest: +SKIP RSASTClassifier(...) - >>> y_pred = clf.predict(X_test) + >>> y_pred = clf.predict(X_test) # doctest: +SKIP """ _tags = { @@ -97,8 +95,7 @@ def _fit(self, X, y): This pipeline classifier """ - from statsmodels.tsa.stattools import acf, pacf - from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning + self._transformer = RSAST( self.n_random_points, From 2f5ad63333cf1bec9542482bcf9f6d88d7802da8 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Tue, 16 Apr 2024 15:47:35 +0200 Subject: [PATCH 35/38] skip in # doctest: +SKIP --- aeon/transformations/collection/shapelet_based/_rsast.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index e9e137c344..255e406120 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -78,11 +78,11 @@ class RSAST(BaseCollectionTransformer): >>> from aeon.datasets import load_unit_test >>> X_train, y_train = load_unit_test(split="train") >>> X_test, y_test = load_unit_test(split="test") - >>> rsast = RSAST() - >>> rsast.fit(X_train, y_train) + >>> rsast = RSAST() # doctest: +SKIP + >>> rsast.fit(X_train, y_train) # doctest: +SKIP RSAST() - >>> X_train = rsast.transform(X_train) - >>> X_test = 
rsast.transform(X_test) + >>> X_train = rsast.transform(X_train) # doctest: +SKIP + >>> X_test = rsast.transform(X_test) # doctest: +SKIP """ From 859c4feb17c2be978fd326a57dae781fbe9fb8a0 Mon Sep 17 00:00:00 2001 From: nirojasva Date: Wed, 17 Apr 2024 09:07:47 +0200 Subject: [PATCH 36/38] using pre-commit --- .../classification/shapelet_based/__init__.py | 2 +- .../shapelet_based/_rsast_classifier.py | 12 +- .../shapelet_based/_sast_classifier.py | 1 - .../collection/shapelet_based/__init__.py | 2 +- .../collection/shapelet_based/_rsast.py | 176 ++++++++++-------- 5 files changed, 105 insertions(+), 88 deletions(-) diff --git a/aeon/classification/shapelet_based/__init__.py b/aeon/classification/shapelet_based/__init__.py index b687d93413..f8c45a242a 100644 --- a/aeon/classification/shapelet_based/__init__.py +++ b/aeon/classification/shapelet_based/__init__.py @@ -12,6 +12,6 @@ from aeon.classification.shapelet_based._ls import LearningShapeletClassifier from aeon.classification.shapelet_based._mrsqm import MrSQMClassifier from aeon.classification.shapelet_based._rdst import RDSTClassifier -from aeon.classification.shapelet_based._sast_classifier import SASTClassifier from aeon.classification.shapelet_based._rsast_classifier import RSASTClassifier +from aeon.classification.shapelet_based._sast_classifier import SASTClassifier from aeon.classification.shapelet_based._stc import ShapeletTransformClassifier diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index f88aec0727..78c08d92d9 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -18,14 +18,15 @@ from aeon.transformations.collection.shapelet_based import RSAST from aeon.utils.numba.general import z_normalise_series + class RSASTClassifier(BaseClassifier): """Classification pipeline using RSAST [1]_ transformer and an sklean classifier. 
Parameters ---------- n_random_points: int default = 10 the number of initial random points to extract - len_method: string default="both" the type of statistical tool used to get the - length of shapelets. "both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, + len_method: string default="both" the type of statistical tool used to get the + length of shapelets. "both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, "None"=Extract randomly any length from the TS nb_inst_per_class : int default = 10 the number of reference time series to select per class @@ -39,10 +40,10 @@ class RSASTClassifier(BaseClassifier): Reference --------- - .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling + .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling Shapelets for Time Series Classification. https://hal.science/hal-04311309/ - + Examples -------- >>> from aeon.classification.shapelet_based import RSASTClassifier @@ -95,8 +96,6 @@ def _fit(self, X, y): This pipeline classifier """ - - self._transformer = RSAST( self.n_random_points, self.len_method, @@ -160,6 +159,7 @@ def _predict_proba(self, X): def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): import matplotlib.pyplot as plt + """Plot the most important features on ts. Parameters diff --git a/aeon/classification/shapelet_based/_sast_classifier.py b/aeon/classification/shapelet_based/_sast_classifier.py index 990366e79d..a2bdca6fad 100644 --- a/aeon/classification/shapelet_based/_sast_classifier.py +++ b/aeon/classification/shapelet_based/_sast_classifier.py @@ -18,7 +18,6 @@ from aeon.utils.numba.general import z_normalise_series - class SASTClassifier(BaseClassifier): """Classification pipeline using SAST [1]_ transformer and an sklean classifier. 
diff --git a/aeon/transformations/collection/shapelet_based/__init__.py b/aeon/transformations/collection/shapelet_based/__init__.py index e7851a5dbe..23990ad520 100644 --- a/aeon/transformations/collection/shapelet_based/__init__.py +++ b/aeon/transformations/collection/shapelet_based/__init__.py @@ -5,8 +5,8 @@ from aeon.transformations.collection.shapelet_based._dilated_shapelet_transform import ( RandomDilatedShapeletTransform, ) -from aeon.transformations.collection.shapelet_based._sast import SAST from aeon.transformations.collection.shapelet_based._rsast import RSAST +from aeon.transformations.collection.shapelet_based._sast import SAST from aeon.transformations.collection.shapelet_based._shapelet_transform import ( RandomShapeletTransform, ) diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 255e406120..3ce7ce0277 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -1,9 +1,10 @@ import numpy as np +import pandas as pd from numba import get_num_threads, njit, prange, set_num_threads + from aeon.transformations.collection import BaseCollectionTransformer from aeon.utils.numba.general import z_normalise_series from aeon.utils.validation import check_n_jobs -import pandas as pd @njit(fastmath=False) @@ -15,7 +16,7 @@ def _apply_kernel(ts, arr): kernel_len = kernel.shape[0] for i in range(m - kernel_len + 1): d = np.sum((z_normalise_series(ts[i : i + kernel_len]) - kernel) ** 2) - if d < d_best: + if d < d_best: d_best = d return d_best @@ -35,27 +36,27 @@ def _apply_kernels(X, kernels): class RSAST(BaseCollectionTransformer): """Random Scalable and Accurate Subsequence Transform (SAST). 
- RSAST [1] is based on SAST, it uses a stratified sampling strategy + RSAST [1] is based on SAST, it uses a stratified sampling strategy for subsequences selection but additionally takes into account certain - statistical criteria such as ANOVA, ACF, and PACF to further reduce + statistical criteria such as ANOVA, ACF, and PACF to further reduce the search space of shapelets. - + RSAST starts with the pre-computation of a list of weights, using ANOVA, - which helps in the selection of initial points for subsequences. Then - randomly select k time series per class, which are used with an ACF and PACF, - obtaining a set of highly correlated lagged values. These values are used as - potential lengths for the shapelets. Lastly, with a pre-defined number of - admissible starting points to sample, the shapelets are extracted and used to - transform the original dataset, replacing each time series by the vector of its + which helps in the selection of initial points for subsequences. Then + randomly select k time series per class, which are used with an ACF and PACF, + obtaining a set of highly correlated lagged values. These values are used as + potential lengths for the shapelets. Lastly, with a pre-defined number of + admissible starting points to sample, the shapelets are extracted and used to + transform the original dataset, replacing each time series by the vector of its distance to each subsequence. Parameters ---------- n_random_points: int default = 10 the number of initial random points to extract - len_method: string default="both" the type of statistical tool used to get - the length of shapelets. "both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, + len_method: string default="both" the type of statistical tool used to get + the length of shapelets. 
"both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF, "None"=Extract randomly any length from the TS - + nb_inst_per_class : int default = 10 the number of reference time series to select per class seed : int, default = None @@ -67,7 +68,7 @@ class RSAST(BaseCollectionTransformer): Reference --------- - .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). + .. [1] Varela, N. R., Mbouopda, M. F., & Nguifo, E. M. (2023). RSAST: Sampling Shapelets for Time Series Classification. https://hal.science/hal-04311309/ @@ -113,10 +114,10 @@ def __init__( super().__init__() def _fit(self, X, y): - + + from scipy.stats import ConstantInputWarning, DegenerateDataWarning, f_oneway from statsmodels.tsa.stattools import acf, pacf - from scipy.stats import f_oneway, DegenerateDataWarning, ConstantInputWarning - + """Select reference time series and generate subsequences from them. Parameters @@ -132,20 +133,21 @@ def _fit(self, X, y): This transformer """ - + # 0- initialize variables and convert values in "y" to string X_ = np.reshape(X, (X.shape[0], X.shape[-1])) self._random_state = ( np.random.RandomState(self.seed) if not isinstance(self.seed, np.random.RandomState) - else self.seed) + else self.seed + ) classes = np.unique(y) self._num_classes = classes.shape[0] y = np.asarray([str(x_s) for x_s in y]) - + n = [] classes = np.unique(y) self.num_classes = classes.shape[0] @@ -155,10 +157,9 @@ def _fit(self, X, y): for i in range(X_.shape[1]): statistic_per_class = {} for c in classes: - assert len( - X_[ - np.where(y == c)[0] - ][:, i]) > 0, 'Time t without values in TS' + assert ( + len(X_[np.where(y == c)[0]][:, i]) > 0 + ), "Time t without values in TS" statistic_per_class[c] = X_[np.where(y == c)[0]][:, i] statistic_per_class = pd.Series(statistic_per_class) @@ -169,125 +170,142 @@ def _fit(self, X, y): p_value = np.nan # Interpretation of the results - # if p_value < 0.05: " The means of the populations are + # if p_value < 0.05: " The means of the populations are # 
significantly different." if np.isnan(p_value): n.append(0) else: - n.append(1-p_value) + n.append(1 - p_value) # 2--calculate PACF and ACF for each TS chossen in each class - + for i, c in enumerate(classes): - + X_c = X_[y == c] cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int) choosen = self._random_state.permutation(X_c.shape[0])[:cnt] - + self._kernels_generators[c] = [] for rep, idx in enumerate(choosen): # defining indices for length list - idx_len_list = c + ","+str(idx) + "," + str(rep) + idx_len_list = c + "," + str(idx) + "," + str(rep) self._cand_length_list[idx_len_list] = [] - + non_zero_acf = [] - if (self.len_method == "both" or self.len_method == "ACF"): + if self.len_method == "both" or self.len_method == "ACF": # 2.1 -- Compute Autorrelation per object acf_val, acf_confint = acf( - X_c[idx], nlags=len(X_c[idx]) - 1, - alpha=.05) + X_c[idx], nlags=len(X_c[idx]) - 1, alpha=0.05 + ) for j in range(len(acf_confint)): - if (3 <= j and - (0 < acf_confint[j][0] <= acf_confint[j][1] or - acf_confint[j][0] <= acf_confint[j][1] < 0)): + if 3 <= j and ( + 0 < acf_confint[j][0] <= acf_confint[j][1] + or acf_confint[j][0] <= acf_confint[j][1] < 0 + ): non_zero_acf.append(j) self._cand_length_list[idx_len_list].append(j) non_zero_pacf = [] - if (self.len_method == "both" or self.len_method == "PACF"): + if self.len_method == "both" or self.len_method == "PACF": # 2.2 Compute Partial Autorrelation per object pacf_val, pacf_confint = pacf( - X_c[idx], method="ols", nlags=(len(X_c[idx]) // 2) - 1, - alpha=.05) + X_c[idx], + method="ols", + nlags=(len(X_c[idx]) // 2) - 1, + alpha=0.05, + ) for j in range(len(pacf_confint)): - if (3 <= j and - (0 < pacf_confint[j][0] <= pacf_confint[j][1] or - pacf_confint[j][0] <= pacf_confint[j][1] < 0)): + if 3 <= j and ( + 0 < pacf_confint[j][0] <= pacf_confint[j][1] + or pacf_confint[j][0] <= pacf_confint[j][1] < 0 + ): non_zero_pacf.append(j) self._cand_length_list[idx_len_list].append(j) - - if 
(self.len_method == "all"): + + if self.len_method == "all": self._cand_length_list[idx_len_list].extend( - np.arange(3, 1 + len(X_c[idx]))) - + np.arange(3, 1 + len(X_c[idx])) + ) + # 2.3-- Save the maximum autocorralated lag value as shapelet lenght if len(self._cand_length_list[idx_len_list]) == 0: - # chose a random lenght using the lenght of the time series + # chose a random lenght using the lenght of the time series # (added 1 since the range start in 0) rand_value = self._random_state.choice(len(X_c[idx]), 1)[0] + 1 self._cand_length_list[idx_len_list].extend([max(3, rand_value)]) - self._cand_length_list[idx_len_list] = list(set( - self._cand_length_list[idx_len_list])) + self._cand_length_list[idx_len_list] = list( + set(self._cand_length_list[idx_len_list]) + ) for max_shp_length in self._cand_length_list[idx_len_list]: - # 2.4-- Choose randomly n_random_points point for a TS - # 2.5-- calculate the weights of probabilities for a random point + # 2.4-- Choose randomly n_random_points point for a TS + # 2.5-- calculate the weights of probabilities for a random point # in a TS if sum(n) == 0: - # Determine equal weights of a random point point in TS is + # Determine equal weights of a random point point in TS is # there are no significant points - weights = [1/len(n) for i in range(len(n))] - weights = weights[:len(X_c[idx]) - max_shp_length + 1]/np.sum( - weights[:len(X_c[idx]) - max_shp_length + 1]) - else: - # Determine the weights of a random point point in TS + weights = [1 / len(n) for i in range(len(n))] + weights = weights[ + : len(X_c[idx]) - max_shp_length + 1 + ] / np.sum(weights[: len(X_c[idx]) - max_shp_length + 1]) + else: + # Determine the weights of a random point point in TS # (excluding points after n-l+1) weights = n / np.sum(n) - weights = weights[:len(X_c[idx]) - max_shp_length + 1]/np.sum( - weights[:len(X_c[idx]) - max_shp_length + 1]) + weights = weights[ + : len(X_c[idx]) - max_shp_length + 1 + ] / np.sum(weights[: len(X_c[idx]) - 
max_shp_length + 1]) - if self.n_random_points > len(X_c[idx])-max_shp_length+1: - # set a upper limit for the posible of number of random + if self.n_random_points > len(X_c[idx]) - max_shp_length + 1: + # set a upper limit for the posible of number of random # points when selecting without replacement limit_rpoint = len(X_c[idx]) - max_shp_length + 1 rand_point_ts = self._random_state.choice( - len(X_c[idx]) - max_shp_length + 1, limit_rpoint, - p=weights, replace=False) + len(X_c[idx]) - max_shp_length + 1, + limit_rpoint, + p=weights, + replace=False, + ) else: rand_point_ts = self._random_state.choice( - len(X_c[idx]) - max_shp_length + 1, self.n_random_points, - p=weights, replace=False) - - for i in rand_point_ts: + len(X_c[idx]) - max_shp_length + 1, + self.n_random_points, + p=weights, + replace=False, + ) + + for i in rand_point_ts: # 2.6-- Extract the subsequence with that point kernel = X_c[idx][i : i + max_shp_length].reshape(1, -1).copy() - + if m_kernel < max_shp_length: - m_kernel = max_shp_length - + m_kernel = max_shp_length + self._kernel_orig.append(np.squeeze(kernel)) self._kernels_generators[c].extend(X_c[idx].reshape(1, -1)) - + # 3--save the calculated subsequences n_kernels = len(self._kernel_orig) - + self._kernels = np.full( - (n_kernels, m_kernel), dtype=np.float32, fill_value=np.nan) - + (n_kernels, m_kernel), dtype=np.float32, fill_value=np.nan + ) + for k, kernel in enumerate(self._kernel_orig): - self._kernels[k, :len(kernel)] = z_normalise_series(kernel) - + self._kernels[k, : len(kernel)] = z_normalise_series(kernel) + return self def _transform(self, X, y=None): """Transform the input X using the generated subsequences. 
+ Parameters ---------- X: np.ndarray shape (n_cases, n_channels, n_timepoints) @@ -307,7 +325,7 @@ def _transform(self, X, y=None): n_jobs = check_n_jobs(self.n_jobs) set_num_threads(n_jobs) - + X_transformed = _apply_kernels(X_, self._kernels) # subsequence transform of X set_num_threads(prev_threads) From 60e68e2ae4fd5f3458b2ff19cc51b2035ac4862b Mon Sep 17 00:00:00 2001 From: nirojasva Date: Wed, 17 Apr 2024 09:07:59 +0200 Subject: [PATCH 37/38] using pre-commit --- aeon/classification/shapelet_based/_rsast_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 78c08d92d9..1b3d736244 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -20,7 +20,7 @@ class RSASTClassifier(BaseClassifier): - """Classification pipeline using RSAST [1]_ transformer and an sklean classifier. + """Classification pipeline using RSAST [1]_ transformer and an sklearn classifier. Parameters ---------- From f4167b32be9ff0b717007366c00a59519dac5abc Mon Sep 17 00:00:00 2001 From: nirojasva Date: Wed, 24 Apr 2024 09:49:37 +0200 Subject: [PATCH 38/38] updated changes requested for PR --- .../shapelet_based/_rsast_classifier.py | 59 +++---------------- .../collection/shapelet_based/_rsast.py | 6 +- 2 files changed, 11 insertions(+), 54 deletions(-) diff --git a/aeon/classification/shapelet_based/_rsast_classifier.py b/aeon/classification/shapelet_based/_rsast_classifier.py index 1b3d736244..5b591a48fe 100644 --- a/aeon/classification/shapelet_based/_rsast_classifier.py +++ b/aeon/classification/shapelet_based/_rsast_classifier.py @@ -3,12 +3,9 @@ Pipeline classifier using the RSAST transformer and an sklearn classifier. 
""" -__maintainer__ = [] +__maintainer__ = ["nirojasva"] __all__ = ["RSASTClassifier"] - -from operator import itemgetter - import numpy as np from sklearn.linear_model import RidgeClassifierCV from sklearn.pipeline import make_pipeline @@ -16,11 +13,14 @@ from aeon.base._base import _clone_estimator from aeon.classification import BaseClassifier from aeon.transformations.collection.shapelet_based import RSAST -from aeon.utils.numba.general import z_normalise_series class RSASTClassifier(BaseClassifier): - """Classification pipeline using RSAST [1]_ transformer and an sklearn classifier. + """RSASTClassifier. + + Classification pipeline using + Random Scalable and Accurate Subsequence Transform (RSAST) [1]_ transformer + and an sklearn classifier. Parameters ---------- @@ -59,8 +59,8 @@ class RSASTClassifier(BaseClassifier): _tags = { "capability:multithreading": True, "capability:multivariate": False, - "algorithm_type": "subsequence", - "python_dependencies": ["statsmodels"], + "algorithm_type": "shapelet", + "python_dependencies": "statsmodels", } def __init__( @@ -156,46 +156,3 @@ def _predict_proba(self, X): for i in range(0, X.shape[0]): dists[i, np.where(self.classes_ == preds[i])] = 1 return dists - - def plot_most_important_feature_on_ts(self, ts, feature_importance, limit=5): - import matplotlib.pyplot as plt - - """Plot the most important features on ts. 
- - Parameters - ---------- - ts : float[:] - The time series - feature_importance : float[:] - The importance of each feature in the transformed data - limit : int, default = 5 - The maximum number of features to plot - - Returns - ------- - fig : plt.figure - The figure - """ - features = zip(self._transformer._kernel_orig, feature_importance) - sorted_features = sorted(features, key=itemgetter(1), reverse=True) - - max_ = min(limit, len(sorted_features)) - - fig, axes = plt.subplots( - 1, max_, sharey=True, figsize=(3 * max_, 3), tight_layout=True - ) - - for f in range(max_): - kernel, _ = sorted_features[f] - znorm_kernel = z_normalise_series(kernel) - d_best = np.inf - for i in range(ts.size - kernel.size): - s = ts[i : i + kernel.size] - s = z_normalise_series(s) - d = np.sum((s - znorm_kernel) ** 2) - if d < d_best: - d_best = d - start_pos = i - axes[f].plot(range(start_pos, start_pos + kernel.size), kernel, linewidth=5) - axes[f].plot(range(ts.size), ts, linewidth=2) - axes[f].set_title(f"feature: {f+1}") diff --git a/aeon/transformations/collection/shapelet_based/_rsast.py b/aeon/transformations/collection/shapelet_based/_rsast.py index 3ce7ce0277..e0e7f8abd2 100644 --- a/aeon/transformations/collection/shapelet_based/_rsast.py +++ b/aeon/transformations/collection/shapelet_based/_rsast.py @@ -34,7 +34,7 @@ def _apply_kernels(X, kernels): class RSAST(BaseCollectionTransformer): - """Random Scalable and Accurate Subsequence Transform (SAST). + """Random Scalable and Accurate Subsequence Transform (RSAST). RSAST [1] is based on SAST, it uses a stratified sampling strategy for subsequences selection but additionally takes into account certain @@ -90,8 +90,8 @@ class RSAST(BaseCollectionTransformer): _tags = { "output_data_type": "Tabular", "capability:multivariate": False, - "algorithm_type": "subsequence", - "python_dependencies": ["statsmodels"], + "algorithm_type": "shapelet", + "python_dependencies": "statsmodels", } def __init__(