From aff5ed57393b418bc0669000dda45f4df42e63b6 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Tue, 14 Jan 2025 10:26:05 +0000 Subject: [PATCH 1/9] first draft --- .../collection/imbalance/__init__.py | 1 + .../collection/imbalance/_smote.py | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 aeon/transformations/collection/imbalance/__init__.py create mode 100644 aeon/transformations/collection/imbalance/_smote.py diff --git a/aeon/transformations/collection/imbalance/__init__.py b/aeon/transformations/collection/imbalance/__init__.py new file mode 100644 index 0000000000..eeff2f5d85 --- /dev/null +++ b/aeon/transformations/collection/imbalance/__init__.py @@ -0,0 +1 @@ +"""Supervised transformers to rebalance colelctions of time series.""" diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py new file mode 100644 index 0000000000..24078d180e --- /dev/null +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -0,0 +1,30 @@ +"""Wrapper for imblearn minority class rebalancer SMOTE.""" + +from imblearn.over_sampling import SMOTE as smote + +from aeon.transformations.collection import BaseCollectionTransformer + +__maintainer__ = ["TonyBagnall"] +__all__ = ["SMOTE"] + + +class SMOTE(BaseCollectionTransformer): + """Wrapper for SMOTE transform.""" + + _tags = { + "capability:multivariate": True, + "capability:unequal_length": True, + "requires_y": True, + } + + def __init__(self, sampling_strategy="auto", random_state=None, k_neighbors=5): + self.sampling_strategy = sampling_strategy + self.random_state = random_state + self.k_neighbors = k_neighbors + + def _fit(self, X, y=None): + self.smote_ = smote(self.sampling_strategy, self.random_state, self.k_neighbors) + self.smote_.fit(X, y) + + def _transform(self, X, y=None): + return self.smote_.resample(X, y) From 4bec820c8b9995cacad10261b736f004541d53ce Mon Sep 17 00:00:00 2001 From: Chuanhang Qiu 
<80885865+LinGinQiu@users.noreply.github.com> Date: Thu, 23 Jan 2025 15:00:27 +0000 Subject: [PATCH 2/9] [ENH] wrapper for smote and adasyn of the imbalance module in collection transformers (#2501) * smote & adasyn in aeon.transformation.imbalance * smote & adasyn in aeon.transformation.imbalance * smote & adasyn in aeon.transformation.imbalance * smote & adasyn in aeon.transformation.imbalance --- .../collection/imbalance/__init__.py | 5 + .../collection/imbalance/_adasyn.py | 140 ++++++++++++ .../collection/imbalance/_smote.py | 216 +++++++++++++++++- .../collection/tests/test_imbalance.py | 60 +++++ 4 files changed, 411 insertions(+), 10 deletions(-) create mode 100644 aeon/transformations/collection/imbalance/_adasyn.py create mode 100644 aeon/transformations/collection/tests/test_imbalance.py diff --git a/aeon/transformations/collection/imbalance/__init__.py b/aeon/transformations/collection/imbalance/__init__.py index eeff2f5d85..280251ad04 100644 --- a/aeon/transformations/collection/imbalance/__init__.py +++ b/aeon/transformations/collection/imbalance/__init__.py @@ -1 +1,6 @@ """Supervised transformers to rebalance colelctions of time series.""" + +__all__ = ["SMOTE", "ADASYN"] + +from aeon.transformations.collection.imbalance._smote import SMOTE +from aeon.transformations.collection.imbalance._adasyn import ADASYN diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py new file mode 100644 index 0000000000..72818b72a8 --- /dev/null +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -0,0 +1,140 @@ +""" +implement for imblearn minority class rebalancer ADASYN. 
+see more in imblearn.over_sampling.ADASYN +original authors: +# Guillaume Lemaitre +# Christos Aridas +# License: MIT +""" +import numpy as np +from aeon.transformations.collection import BaseCollectionTransformer +from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_random_state +from scipy import sparse +from collections import OrderedDict + +__maintainer__ = ["TonyBagnall, Chris Qiu"] +__all__ = ["ADASYN"] + + +class ADASYN(BaseCollectionTransformer): + """ + Class to perform over-sampling using ADASYN . + + This object is a simplified implementation of ADASYN - Adaptive + Synthetic (ADASYN) algorithm as presented in imblearn.over_sampling.ADASYN + This method is similar to SMOTE, but it generates different number of + samples depending on an estimate of the local distribution of the class + to be oversampled. + Parameters + ---------- + {random_state} + + k_neighbors : int or object, default=5 + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` + instance will be fitted in this case. 
+ """ + + _tags = { + "capability:multivariate": True, + "capability:unequal_length": True, + "requires_y": True, + } + + def __init__(self, random_state=None, k_neighbors=5): + self.random_state = random_state + self.k_neighbors = k_neighbors + super().__init__() + + def _fit(self, X, y=None): + # set the additional_neighbor=1 + self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) + + # generate sampling target by targeting all classes but not the majority + unique, counts = np.unique(y, return_counts=True) + target_stats = dict(zip(unique, counts)) + n_sample_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_majority - value + for (key, value) in target_stats.items() + if key != class_majority + } + self.sampling_strategy_ = OrderedDict( + sorted(sampling_strategy.items()) + ) + return self + + def _transform(self, X, y=None): + shape_recover = False # use to recover the shape of X + if X.ndim == 3 and X.shape[1] == 1: + X = np.squeeze(X, axis=1) # remove the middle dimension to be compatible with sklearn + shape_recover = True + random_state = check_random_state(self.random_state) + X_resampled = [X.copy()] + y_resampled = [y.copy()] + + # got the minority class label and the number needs to be generated i.e. num_majority - num_minority + for class_sample, n_samples in self.sampling_strategy_.items(): + if n_samples == 0: + continue + target_class_indices = np.flatnonzero(y == class_sample) + X_class = X[target_class_indices] + + self.nn_.fit(X) + nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] + # The ratio is computed using a one-vs-rest manner. Using majority + # in multi-class would lead to slightly different results at the + # cost of introducing a new parameter. 
+ n_neighbors = self.nn_.n_neighbors - 1 + ratio_nn = np.sum(y[nns] != class_sample, axis=1) / n_neighbors + if not np.sum(ratio_nn): + raise RuntimeError( + "Not any neigbours belong to the majority" + " class. This case will induce a NaN case" + " with a division by zero. ADASYN is not" + " suited for this specific dataset." + " Use SMOTE instead." + ) + ratio_nn /= np.sum(ratio_nn) + n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) + # rounding may cause new amount for n_samples + n_samples = np.sum(n_samples_generate) + if not n_samples: + raise ValueError( + "No samples will be generated with the provided ratio settings." + ) + + # the nearest neighbors need to be fitted only on the current class + # to find the class NN to generate new samples + self.nn_.fit(X_class) + nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] + + enumerated_class_indices = np.arange(len(target_class_indices)) + rows = np.repeat(enumerated_class_indices, n_samples_generate) + cols = random_state.choice(n_neighbors, size=n_samples) + diffs = X_class[nns[rows, cols]] - X_class[rows] + steps = random_state.uniform(size=(n_samples, 1)) + + if sparse.issparse(X): + sparse_func = type(X).__name__ + steps = getattr(sparse, sparse_func)(steps) + X_new = X_class[rows] + steps.multiply(diffs) + else: + X_new = X_class[rows] + steps * diffs + + X_new = X_new.astype(X.dtype) + y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype) + X_resampled.append(X_new) + y_resampled.append(y_new) + + if sparse.issparse(X): + X_resampled = sparse.vstack(X_resampled, format=X.format) + else: + X_resampled = np.vstack(X_resampled) + y_resampled = np.hstack(y_resampled) + + if shape_recover: + X_resampled = X_resampled[:, np.newaxis, :] + return X_resampled, y_resampled diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index 24078d180e..36aea38b1c 100644 --- 
a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -1,15 +1,42 @@ -"""Wrapper for imblearn minority class rebalancer SMOTE.""" - -from imblearn.over_sampling import SMOTE as smote +""" +implement for imblearn minority class rebalancer SMOTE. +see more in imblearn.over_sampling.SMOTE +original authors: +# Guillaume Lemaitre +# Fernando Nogueira +# Christos Aridas +# Dzianis Dudnik +# License: MIT +""" +import numpy as np from aeon.transformations.collection import BaseCollectionTransformer +from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_random_state +from scipy import sparse +from collections import OrderedDict -__maintainer__ = ["TonyBagnall"] +__maintainer__ = ["TonyBagnall, Chris Qiu"] __all__ = ["SMOTE"] class SMOTE(BaseCollectionTransformer): - """Wrapper for SMOTE transform.""" + """ + Class to perform over-sampling using SMOTE. + + This object is a simplified implementation of SMOTE - Synthetic Minority + Over-sampling Technique as presented in imblearn.over_sampling.SMOTE + sampling_strategy is sampling target by targeting all classes but not the + majority, which directly expressed in _fit.sampling_strategy. + Parameters + ---------- + {random_state} + + k_neighbors : int or object, default=5 + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` + instance will be fitted in this case. 
+ """ _tags = { "capability:multivariate": True, @@ -17,14 +44,183 @@ class SMOTE(BaseCollectionTransformer): "requires_y": True, } - def __init__(self, sampling_strategy="auto", random_state=None, k_neighbors=5): - self.sampling_strategy = sampling_strategy + def __init__(self, random_state=None, k_neighbors=5): self.random_state = random_state self.k_neighbors = k_neighbors + super().__init__() def _fit(self, X, y=None): - self.smote_ = smote(self.sampling_strategy, self.random_state, self.k_neighbors) - self.smote_.fit(X, y) + # set the additional_neighbor=1 + self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) + + # generate sampling target by targeting all classes but not the majority + unique, counts = np.unique(y, return_counts=True) + target_stats = dict(zip(unique, counts)) + n_sample_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_majority - value + for (key, value) in target_stats.items() + if key != class_majority + } + self.sampling_strategy_ = OrderedDict( + sorted(sampling_strategy.items()) + ) + return self def _transform(self, X, y=None): - return self.smote_.resample(X, y) + shape_recover = False # use to recover the shape of X + if X.ndim == 3 and X.shape[1] == 1: + X = np.squeeze(X, axis=1) # remove the middle dimension to be compatible with sklearn + shape_recover = True + X_resampled = [X.copy()] + y_resampled = [y.copy()] + + # got the minority class label and the number needs to be generated i.e. 
num_majority - num_minority + for class_sample, n_samples in self.sampling_strategy_.items(): + if n_samples == 0: + continue + target_class_indices = np.flatnonzero(y == class_sample) + X_class = X[target_class_indices] + + self.nn_.fit(X_class) + nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] + X_new, y_new = self._make_samples( + X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0 + ) + X_resampled.append(X_new) + y_resampled.append(y_new) + + if sparse.issparse(X): + X_resampled = sparse.vstack(X_resampled, format=X.format) + else: + X_resampled = np.vstack(X_resampled) + y_resampled = np.hstack(y_resampled) + if shape_recover: + X_resampled = X_resampled[:, np.newaxis, :] + return X_resampled, y_resampled + + def _make_samples( + self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None + ): + """A support function that returns artificial samples constructed along + the line connecting nearest neighbours. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Points from which the points will be created. + + y_dtype : dtype + The data type of the targets. + + y_type : str or int + The minority target value, just so the function can return the + target values for the synthetic variables with correct length in + a clear format. + + nn_data : ndarray of shape (n_samples_all, n_features) + Data set carrying all the neighbours to be used + + nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours) + The nearest neighbours of each sample in `nn_data`. + + n_samples : int + The number of samples to generate. + + step_size : float, default=1.0 + The step size to create samples. + + y : ndarray of shape (n_samples_all,), default=None + The true target associated with `nn_data`. Used by Borderline SMOTE-2 to + weight the distances in the sample generation process. 
+ + Returns + ------- + X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features) + Synthetically generated samples. + + y_new : ndarray of shape (n_samples_new,) + Target values for synthetic samples. + """ + random_state = check_random_state(self.random_state) + samples_indices = random_state.randint(low=0, high=nn_num.size, size=n_samples) + + # np.newaxis for backwards compatability with random_state + steps = step_size * random_state.uniform(size=n_samples)[:, np.newaxis] + rows = np.floor_divide(samples_indices, nn_num.shape[1]) + cols = np.mod(samples_indices, nn_num.shape[1]) + + X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y) + y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype) + return X_new, y_new + + def _generate_samples( + self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None + ): + r"""Generate a synthetic sample. + + The rule for the generation is: + + .. math:: + \mathbf{s_{s}} = \mathbf{s_{i}} + \mathcal{u}(0, 1) \times + (\mathbf{s_{i}} - \mathbf{s_{nn}}) \, + + where \mathbf{s_{s}} is the new synthetic samples, \mathbf{s_{i}} is + the current sample, \mathbf{s_{nn}} is a randomly selected neighbors of + \mathbf{s_{i}} and \mathcal{u}(0, 1) is a random number between [0, 1). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Points from which the points will be created. + + nn_data : ndarray of shape (n_samples_all, n_features) + Data set carrying all the neighbours to be used. + + nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours) + The nearest neighbours of each sample in `nn_data`. + + rows : ndarray of shape (n_samples,), dtype=int + Indices pointing at feature vector in X which will be used + as a base for creating new samples. + + cols : ndarray of shape (n_samples,), dtype=int + Indices pointing at which nearest neighbor of base feature vector + will be used when creating new samples. 
+ + steps : ndarray of shape (n_samples,), dtype=float + Step sizes for new samples. + + y_type : str, int or None, default=None + Class label of the current target classes for which we want to generate + samples. + + y : ndarray of shape (n_samples_all,), default=None + The true target associated with `nn_data`. Used by Borderline SMOTE-2 to + weight the distances in the sample generation process. + + Returns + ------- + X_new : {ndarray, sparse matrix} of shape (n_samples, n_features) + Synthetically generated samples. + """ + diffs = nn_data[nn_num[rows, cols]] - X[rows] + if y is not None: # only entering for BorderlineSMOTE-2 + random_state = check_random_state(self.random_state) + mask_pair_samples = y[nn_num[rows, cols]] != y_type + diffs[mask_pair_samples] *= random_state.uniform( + low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1) + ) + + if sparse.issparse(X): + sparse_func = type(X).__name__ + steps = getattr(sparse, sparse_func)(steps) + X_new = X[rows] + steps.multiply(diffs) + else: + X_new = X[rows] + steps * diffs + + return X_new.astype(X.dtype) + + + diff --git a/aeon/transformations/collection/tests/test_imbalance.py b/aeon/transformations/collection/tests/test_imbalance.py new file mode 100644 index 0000000000..f56df6fcfe --- /dev/null +++ b/aeon/transformations/collection/tests/test_imbalance.py @@ -0,0 +1,60 @@ +"""Tests for the rebalancer transformers.""" + +import numpy as np +import pytest + +from aeon.transformations.collection.imbalance import SMOTE, ADASYN + + +def test_smote(): + """Test the SMOTE class. + + This function creates a 3D numpy array, applies + SMOTE using the SMOTE class, and asserts that the + transformed data has a balanced number of samples. 
+ """ + n_samples = 100 # Total number of labels + majority_num = 90 # number of majority class + minority_num = n_samples - majority_num # number of minority class + + X = np.random.rand(n_samples, 1, 10) + y = np.array([0] * majority_num + [1] * minority_num) + + transformer = SMOTE() + transformer.fit(X, y) + res_X, res_y = transformer.transform(X, y) + _, res_count = np.unique(res_y, return_counts=True) + + assert len(res_X) == 2 * majority_num + assert len(res_y) == 2 * majority_num + assert res_count[0] == majority_num + assert res_count[1] == majority_num + + +def test_adasyn(): + """Test the ADASYN class. + + This function creates a 3D numpy array, applies + ADASYN using the ADASYN class, and asserts that the + transformed data has a balanced number of samples. + ADASYN is a variant of SMOTE that generates synthetic samples, + but it focuses on generating samples near the decision boundary. + Therefore, sometimes, it may generate more or less samples than SMOTE, + which is why we only check if the number of samples is nearly balanced. 
+ """ + n_samples = 100 # Total number of labels + majority_num = 90 # number of majority class + minority_num = n_samples - majority_num # number of minority class + + X = np.random.rand(n_samples, 1, 10) + y = np.array([0] * majority_num + [1] * minority_num) + + transformer = ADASYN() + transformer.fit(X, y) + res_X, res_y = transformer.transform(X, y) + _, res_count = np.unique(res_y, return_counts=True) + + assert np.abs(len(res_X) - 2 * majority_num) < minority_num + assert np.abs(len(res_y) - 2 * majority_num) < minority_num + assert res_count[0] == majority_num + assert np.abs(res_count[0] - res_count[1]) < minority_num From 5db24f39201b630cc6e125027774c9fd1c2fca0b Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Thu, 23 Jan 2025 16:50:11 +0000 Subject: [PATCH 3/9] make experimental --- README.md | 1 + .../collection/imbalance/_adasyn.py | 46 +++++++----- .../collection/imbalance/_smote.py | 75 ++++++++++--------- .../collection/imbalance/tests/__init__.py | 1 + docs/developer_guide/deprecation.md | 1 + docs/index.md | 1 + 6 files changed, 71 insertions(+), 54 deletions(-) create mode 100644 aeon/transformations/collection/imbalance/tests/__init__.py diff --git a/README.md b/README.md index e1475d6d85..e267f053fb 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ does not apply: - `segmentation` - `similarity_search` - `visualisation` +- `transformations.collection.imbalance` | Overview | | 
|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 72818b72a8..2db87c36d1 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -1,17 +1,20 @@ -""" -implement for imblearn minority class rebalancer ADASYN. +"""ADASYN over sampling algorithm. 
+ see more in imblearn.over_sampling.ADASYN original authors: # Guillaume Lemaitre # Christos Aridas # License: MIT """ + +from collections import OrderedDict + import numpy as np -from aeon.transformations.collection import BaseCollectionTransformer +from scipy import sparse from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from scipy import sparse -from collections import OrderedDict + +from aeon.transformations.collection import BaseCollectionTransformer __maintainer__ = ["TonyBagnall, Chris Qiu"] __all__ = ["ADASYN"] @@ -26,20 +29,27 @@ class ADASYN(BaseCollectionTransformer): This method is similar to SMOTE, but it generates different number of samples depending on an estimate of the local distribution of the class to be oversampled. + + Currently only works with two class problems. + Parameters ---------- - {random_state} - k_neighbors : int or object, default=5 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. + random_state : int, RandomState instance or None, default=None + If `int`, random_state is the seed used by the random number generator; + If `RandomState` instance, random_state is the random number generator; + If `None`, the random number generator is the `RandomState` instance used + by `np.random`. """ _tags = { - "capability:multivariate": True, - "capability:unequal_length": True, + "capability:multivariate": False, + "capability:unequal_length": False, "requires_y": True, + "python_dependencies": "imbalanced-learn", } def __init__(self, random_state=None, k_neighbors=5): @@ -51,9 +61,11 @@ def _fit(self, X, y=None): # set the additional_neighbor=1 self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) - # generate sampling target by targeting all classes but not the majority + # resamples all classes except the majority. 
unique, counts = np.unique(y, return_counts=True) target_stats = dict(zip(unique, counts)) + # If two or more classes are equal largest, the majority is assumed to be the + # one with the largest index. n_sample_majority = max(target_stats.values()) class_majority = max(target_stats, key=target_stats.get) sampling_strategy = { @@ -61,21 +73,16 @@ def _fit(self, X, y=None): for (key, value) in target_stats.items() if key != class_majority } - self.sampling_strategy_ = OrderedDict( - sorted(sampling_strategy.items()) - ) + self.sampling_strategy_ = OrderedDict(sorted(sampling_strategy.items())) return self def _transform(self, X, y=None): - shape_recover = False # use to recover the shape of X - if X.ndim == 3 and X.shape[1] == 1: - X = np.squeeze(X, axis=1) # remove the middle dimension to be compatible with sklearn - shape_recover = True + X = np.squeeze(X, axis=1) random_state = check_random_state(self.random_state) X_resampled = [X.copy()] y_resampled = [y.copy()] - # got the minority class label and the number needs to be generated i.e. num_majority - num_minority + # got the minority class label and the number needs to be generated for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue @@ -135,6 +142,5 @@ def _transform(self, X, y=None): X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) - if shape_recover: - X_resampled = X_resampled[:, np.newaxis, :] + X_resampled = X_resampled[:, np.newaxis, :] return X_resampled, y_resampled diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index 36aea38b1c..604179dded 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -1,6 +1,6 @@ -""" -implement for imblearn minority class rebalancer SMOTE. -see more in imblearn.over_sampling.SMOTE +"""SMOTE over sampling algorithm. 
+ +See more in imblearn.over_sampling.SMOTE original authors: # Guillaume Lemaitre # Fernando Nogueira @@ -9,42 +9,58 @@ # License: MIT """ +from collections import OrderedDict + import numpy as np -from aeon.transformations.collection import BaseCollectionTransformer +from scipy import sparse from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from scipy import sparse -from collections import OrderedDict -__maintainer__ = ["TonyBagnall, Chris Qiu"] +from aeon.transformations.collection import BaseCollectionTransformer + +__maintainer__ = ["TonyBagnall"] __all__ = ["SMOTE"] class SMOTE(BaseCollectionTransformer): """ - Class to perform over-sampling using SMOTE. + Over-sampling using the Synthetic Minority Over-sampling TEchnique (SMOTE)[1]_. + + An adaptation of the imbalance-learn implementation of SMOTE in + imblearn.over_sampling.SMOTE. sampling_strategy is sampling target by + targeting all classes but not the majority, which is directly expressed in + _fit.sampling_strategy. - This object is a simplified implementation of SMOTE - Synthetic Minority - Over-sampling Technique as presented in imblearn.over_sampling.SMOTE - sampling_strategy is sampling target by targeting all classes but not the - majority, which directly expressed in _fit.sampling_strategy. Parameters ---------- - {random_state} - k_neighbors : int or object, default=5 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. + random_state : int, RandomState instance or None, default=None + If `int`, random_state is the seed used by the random number generator; + If `RandomState` instance, random_state is the random number generator; + If `None`, the random number generator is the `RandomState` instance used + by `np.random`. + + See Also + -------- + ADASYN + + References + ---------- + .. [1] Chawla et al. 
SMOTE: synthetic minority over-sampling technique, Journal + of Artificial Intelligence Research 16(1): 321–357, 2002. + https://dl.acm.org/doi/10.5555/1622407.1622416 """ _tags = { - "capability:multivariate": True, - "capability:unequal_length": True, + "capability:multivariate": False, + "capability:unequal_length": False, "requires_y": True, } - def __init__(self, random_state=None, k_neighbors=5): + def __init__(self, k_neighbors=5, random_state=None): self.random_state = random_state self.k_neighbors = k_neighbors super().__init__() @@ -63,20 +79,16 @@ def _fit(self, X, y=None): for (key, value) in target_stats.items() if key != class_majority } - self.sampling_strategy_ = OrderedDict( - sorted(sampling_strategy.items()) - ) + self.sampling_strategy_ = OrderedDict(sorted(sampling_strategy.items())) return self def _transform(self, X, y=None): - shape_recover = False # use to recover the shape of X - if X.ndim == 3 and X.shape[1] == 1: - X = np.squeeze(X, axis=1) # remove the middle dimension to be compatible with sklearn - shape_recover = True + # remove the channel dimension to be compatible with sklearn + X = np.squeeze(X, axis=1) X_resampled = [X.copy()] y_resampled = [y.copy()] - # got the minority class label and the number needs to be generated i.e. 
num_majority - num_minority + # got the minority class label and the number needs to be generated for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue @@ -96,15 +108,13 @@ def _transform(self, X, y=None): else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) - if shape_recover: - X_resampled = X_resampled[:, np.newaxis, :] + X_resampled = X_resampled[:, np.newaxis, :] return X_resampled, y_resampled def _make_samples( - self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None + self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None ): - """A support function that returns artificial samples constructed along - the line connecting nearest neighbours. + """Make artificial samples constructed based on nearest neighbours. Parameters ---------- @@ -156,7 +166,7 @@ def _make_samples( return X_new, y_new def _generate_samples( - self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None + self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None ): r"""Generate a synthetic sample. @@ -221,6 +231,3 @@ def _generate_samples( X_new = X[rows] + steps * diffs return X_new.astype(X.dtype) - - - diff --git a/aeon/transformations/collection/imbalance/tests/__init__.py b/aeon/transformations/collection/imbalance/tests/__init__.py new file mode 100644 index 0000000000..55831a6ec8 --- /dev/null +++ b/aeon/transformations/collection/imbalance/tests/__init__.py @@ -0,0 +1 @@ +"""Test resampling transformers.""" diff --git a/docs/developer_guide/deprecation.md b/docs/developer_guide/deprecation.md index 4b10d81cb2..04aadbab3a 100644 --- a/docs/developer_guide/deprecation.md +++ b/docs/developer_guide/deprecation.md @@ -24,6 +24,7 @@ experimental. Currently experimental modules are: - `segmentation` - `similarity_search` - `visualisation` +- `transformations.collection.imbalance` When we introduce a new module, we may classify it as experimental until the API is stable. 
We will try to not make drastic changes to experimental modules, but we need diff --git a/docs/index.md b/docs/index.md index 11b558839e..76fb04e1ce 100644 --- a/docs/index.md +++ b/docs/index.md @@ -276,6 +276,7 @@ experimental modules are: - `segmentation` - `similarity_search` - `visualisation` +- `transformations.collection.imbalance` ```{toctree} :caption: Using aeon From d9b35b79b5df40010cbe792a283009ddbff293d3 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Thu, 23 Jan 2025 17:01:17 +0000 Subject: [PATCH 4/9] inherit from SMOTE --- .../collection/imbalance/_adasyn.py | 78 ++++--------------- 1 file changed, 16 insertions(+), 62 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 2db87c36d1..6b487529d8 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -1,80 +1,34 @@ -"""ADASYN over sampling algorithm. - -see more in imblearn.over_sampling.ADASYN -original authors: -# Guillaume Lemaitre -# Christos Aridas -# License: MIT -""" - -from collections import OrderedDict +"""ADASYN over sampling algorithm.""" import numpy as np from scipy import sparse -from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from aeon.transformations.collection import BaseCollectionTransformer +from aeon.transformations.collection.imbalance import SMOTE -__maintainer__ = ["TonyBagnall, Chris Qiu"] +__maintainer__ = ["TonyBagnall"] __all__ = ["ADASYN"] -class ADASYN(BaseCollectionTransformer): +class ADASYN(SMOTE): """ - Class to perform over-sampling using ADASYN . + Over-sampling using Adaptive Synthetic Sampling (ADASYN). 
- This object is a simplified implementation of ADASYN - Adaptive - Synthetic (ADASYN) algorithm as presented in imblearn.over_sampling.ADASYN - This method is similar to SMOTE, but it generates different number of + Adaptation of imblearn.over_sampling.ADASYN + original authors: + # Guillaume Lemaitre + # Christos Aridas + # License: MIT + + This transformer extends SMOTE, but it generates different number of samples depending on an estimate of the local distribution of the class to be oversampled. - - Currently only works with two class problems. - - Parameters - ---------- - k_neighbors : int or object, default=5 - The nearest neighbors used to define the neighborhood of samples to use - to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` - instance will be fitted in this case. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. """ - _tags = { - "capability:multivariate": False, - "capability:unequal_length": False, - "requires_y": True, - "python_dependencies": "imbalanced-learn", - } - - def __init__(self, random_state=None, k_neighbors=5): - self.random_state = random_state - self.k_neighbors = k_neighbors - super().__init__() - - def _fit(self, X, y=None): - # set the additional_neighbor=1 - self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) - - # resamples all classes except the majority. - unique, counts = np.unique(y, return_counts=True) - target_stats = dict(zip(unique, counts)) - # If two or more classes are equal largest, the majority is assumed to be the - # one with the largest index. 
- n_sample_majority = max(target_stats.values()) - class_majority = max(target_stats, key=target_stats.get) - sampling_strategy = { - key: n_sample_majority - value - for (key, value) in target_stats.items() - if key != class_majority - } - self.sampling_strategy_ = OrderedDict(sorted(sampling_strategy.items())) - return self + def __init__( + self, + ): + super().__init__(random_state=None, k_neighbors=5) def _transform(self, X, y=None): X = np.squeeze(X, axis=1) From 97c7466d076899b54119c7987fdebeaf65ebafbe Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 11:12:58 +0000 Subject: [PATCH 5/9] test equivalence to imblearn --- .../collection/imbalance/_adasyn.py | 6 ++-- .../collection/imbalance/_smote.py | 4 +-- .../collection/imbalance/tests/test_adasyn.py | 32 +++++++++++++++++++ .../collection/imbalance/tests/test_smote.py | 32 +++++++++++++++++++ 4 files changed, 68 insertions(+), 6 deletions(-) create mode 100644 aeon/transformations/collection/imbalance/tests/test_adasyn.py create mode 100644 aeon/transformations/collection/imbalance/tests/test_smote.py diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 6b487529d8..0d78637f86 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -25,10 +25,8 @@ class ADASYN(SMOTE): to be oversampled. 
""" - def __init__( - self, - ): - super().__init__(random_state=None, k_neighbors=5) + def __init__(self, random_state=None, k_neighbors=5): + super().__init__(random_state=random_state, k_neighbors=k_neighbors) def _transform(self, X, y=None): X = np.squeeze(X, axis=1) diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index 604179dded..f56e6f7b40 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -66,10 +66,10 @@ def __init__(self, k_neighbors=5, random_state=None): super().__init__() def _fit(self, X, y=None): - # set the additional_neighbor=1 + # set the additional_neighbor required by SMOTE self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) - # generate sampling target by targeting all classes but not the majority + # generate sampling target by targeting all classes except the majority unique, counts = np.unique(y, return_counts=True) target_stats = dict(zip(unique, counts)) n_sample_majority = max(target_stats.values()) diff --git a/aeon/transformations/collection/imbalance/tests/test_adasyn.py b/aeon/transformations/collection/imbalance/tests/test_adasyn.py new file mode 100644 index 0000000000..3557f85cb4 --- /dev/null +++ b/aeon/transformations/collection/imbalance/tests/test_adasyn.py @@ -0,0 +1,32 @@ +"""Test ADASYN oversampler ported from imblearn.""" + +import numpy as np +import pytest + +from aeon.testing.data_generation import make_example_3d_numpy +from aeon.transformations.collection.imbalance import ADASYN +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +@pytest.mark.skipif( + not _check_soft_dependencies( + "imbalanced-learn", + package_import_alias={"imbalanced-learn": "imblearn"}, + severity="none", + ), + reason="skip test if required soft dependency imbalanced-learn not available", +) +def test_equivalence_imbalance(): + """Test ported ADASYN code produces 
the same as imblearn version.""" + from imblearn.over_sampling import ADASYN as imbADASYN + + X, y = make_example_3d_numpy(n_cases=20, n_channels=1) + y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + X = X.squeeze() + s1 = imbADASYN(random_state=49) + X2, y2 = s1.fit_resample(X, y) + s2 = ADASYN(random_state=49) + X3, y3 = s2.fit_transform(X, y) + X3 = X3.squeeze() + assert np.array_equal(y2, y3) + assert np.allclose(X2, X3, atol=1e-4) diff --git a/aeon/transformations/collection/imbalance/tests/test_smote.py b/aeon/transformations/collection/imbalance/tests/test_smote.py new file mode 100644 index 0000000000..53cc95cac7 --- /dev/null +++ b/aeon/transformations/collection/imbalance/tests/test_smote.py @@ -0,0 +1,32 @@ +"""Test function for SMOTE.""" + +import numpy as np +import pytest + +from aeon.testing.data_generation import make_example_3d_numpy +from aeon.transformations.collection.imbalance import SMOTE +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +@pytest.mark.skipif( + not _check_soft_dependencies( + "imbalanced-learn", + package_import_alias={"imbalanced-learn": "imblearn"}, + severity="none", + ), + reason="skip test if required soft dependency imbalanced-learn not available", +) +def test_equivalence_imbalance(): + """Test ported SMOTE code produces the same as imblearn version.""" + from imblearn.over_sampling import SMOTE as imbSMOTE + + X, y = make_example_3d_numpy(n_cases=20, n_channels=1) + y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + X = X.squeeze() + s1 = imbSMOTE(random_state=49) + X2, y2 = s1.fit_resample(X, y) + s2 = SMOTE(random_state=49) + X3, y3 = s2.fit_transform(X, y) + X3 = X3.squeeze() + assert np.array_equal(y2, y3) + assert np.allclose(X2, X3, atol=1e-4) From a440a90a43e88d2149cf4a181875e4aad03360fb Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 11:18:34 +0000 Subject: [PATCH 6/9] move tests --- 
.../collection/imbalance/tests/test_adasyn.py | 29 +++++++++ .../collection/imbalance/tests/test_smote.py | 25 ++++++++ .../collection/tests/test_imbalance.py | 60 ------------------- 3 files changed, 54 insertions(+), 60 deletions(-) delete mode 100644 aeon/transformations/collection/tests/test_imbalance.py diff --git a/aeon/transformations/collection/imbalance/tests/test_adasyn.py b/aeon/transformations/collection/imbalance/tests/test_adasyn.py index 3557f85cb4..0bb5c62ea6 100644 --- a/aeon/transformations/collection/imbalance/tests/test_adasyn.py +++ b/aeon/transformations/collection/imbalance/tests/test_adasyn.py @@ -8,6 +8,35 @@ from aeon.utils.validation._dependencies import _check_soft_dependencies +def test_adasyn(): + """Test the ADASYN class. + + This function creates a 3D numpy array, applies + ADASYN using the ADASYN class, and asserts that the + transformed data has a balanced number of samples. + ADASYN is a variant of SMOTE that generates synthetic samples, + but it focuses on generating samples near the decision boundary. + Therefore, sometimes, it may generate more or less samples than SMOTE, + which is why we only check if the number of samples is nearly balanced. 
+ """ + n_samples = 100 # Total number of labels + majority_num = 90 # number of majority class + minority_num = n_samples - majority_num # number of minority class + + X = np.random.rand(n_samples, 1, 10) + y = np.array([0] * majority_num + [1] * minority_num) + + transformer = ADASYN() + transformer.fit(X, y) + res_X, res_y = transformer.transform(X, y) + _, res_count = np.unique(res_y, return_counts=True) + + assert np.abs(len(res_X) - 2 * majority_num) < minority_num + assert np.abs(len(res_y) - 2 * majority_num) < minority_num + assert res_count[0] == majority_num + assert np.abs(res_count[0] - res_count[1]) < minority_num + + @pytest.mark.skipif( not _check_soft_dependencies( "imbalanced-learn", diff --git a/aeon/transformations/collection/imbalance/tests/test_smote.py b/aeon/transformations/collection/imbalance/tests/test_smote.py index 53cc95cac7..70189633d0 100644 --- a/aeon/transformations/collection/imbalance/tests/test_smote.py +++ b/aeon/transformations/collection/imbalance/tests/test_smote.py @@ -8,6 +8,31 @@ from aeon.utils.validation._dependencies import _check_soft_dependencies +def test_smote(): + """Test the SMOTE class. + + This function creates a 3D numpy array, applies + SMOTE using the SMOTE class, and asserts that the + transformed data has a balanced number of samples. 
+ """ + n_samples = 100 # Total number of labels + majority_num = 90 # number of majority class + minority_num = n_samples - majority_num # number of minority class + + X = np.random.rand(n_samples, 1, 10) + y = np.array([0] * majority_num + [1] * minority_num) + + transformer = SMOTE() + transformer.fit(X, y) + res_X, res_y = transformer.transform(X, y) + _, res_count = np.unique(res_y, return_counts=True) + + assert len(res_X) == 2 * majority_num + assert len(res_y) == 2 * majority_num + assert res_count[0] == majority_num + assert res_count[1] == majority_num + + @pytest.mark.skipif( not _check_soft_dependencies( "imbalanced-learn", diff --git a/aeon/transformations/collection/tests/test_imbalance.py b/aeon/transformations/collection/tests/test_imbalance.py deleted file mode 100644 index f56df6fcfe..0000000000 --- a/aeon/transformations/collection/tests/test_imbalance.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Tests for the rebalancer transformers.""" - -import numpy as np -import pytest - -from aeon.transformations.collection.imbalance import SMOTE, ADASYN - - -def test_smote(): - """Test the SMOTE class. - - This function creates a 3D numpy array, applies - SMOTE using the SMOTE class, and asserts that the - transformed data has a balanced number of samples. - """ - n_samples = 100 # Total number of labels - majority_num = 90 # number of majority class - minority_num = n_samples - majority_num # number of minority class - - X = np.random.rand(n_samples, 1, 10) - y = np.array([0] * majority_num + [1] * minority_num) - - transformer = SMOTE() - transformer.fit(X, y) - res_X, res_y = transformer.transform(X, y) - _, res_count = np.unique(res_y, return_counts=True) - - assert len(res_X) == 2 * majority_num - assert len(res_y) == 2 * majority_num - assert res_count[0] == majority_num - assert res_count[1] == majority_num - - -def test_adasyn(): - """Test the ADASYN class. 
- - This function creates a 3D numpy array, applies - ADASYN using the ADASYN class, and asserts that the - transformed data has a balanced number of samples. - ADASYN is a variant of SMOTE that generates synthetic samples, - but it focuses on generating samples near the decision boundary. - Therefore, sometimes, it may generate more or less samples than SMOTE, - which is why we only check if the number of samples is nearly balanced. - """ - n_samples = 100 # Total number of labels - majority_num = 90 # number of majority class - minority_num = n_samples - majority_num # number of minority class - - X = np.random.rand(n_samples, 1, 10) - y = np.array([0] * majority_num + [1] * minority_num) - - transformer = ADASYN() - transformer.fit(X, y) - res_X, res_y = transformer.transform(X, y) - _, res_count = np.unique(res_y, return_counts=True) - - assert np.abs(len(res_X) - 2 * majority_num) < minority_num - assert np.abs(len(res_y) - 2 * majority_num) < minority_num - assert res_count[0] == majority_num - assert np.abs(res_count[0] - res_count[1]) < minority_num From 6e24ef0c5327eabd284fa988095b92ae979b0ffe Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 11:29:58 +0000 Subject: [PATCH 7/9] format --- aeon/transformations/collection/imbalance/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aeon/transformations/collection/imbalance/__init__.py b/aeon/transformations/collection/imbalance/__init__.py index 280251ad04..38441e9e9f 100644 --- a/aeon/transformations/collection/imbalance/__init__.py +++ b/aeon/transformations/collection/imbalance/__init__.py @@ -1,6 +1,6 @@ """Supervised transformers to rebalance colelctions of time series.""" -__all__ = ["SMOTE", "ADASYN"] +__all__ = ["ADASYN", "SMOTE"] -from aeon.transformations.collection.imbalance._smote import SMOTE from aeon.transformations.collection.imbalance._adasyn import ADASYN +from aeon.transformations.collection.imbalance._smote import SMOTE From 
c73111755a727ecdf804010b85a8f6295322f13b Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 11:50:54 +0000 Subject: [PATCH 8/9] import --- .../collection/imbalance/_adasyn.py | 17 ++--------- .../collection/imbalance/_smote.py | 29 ++++++------------- 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 0d78637f86..412007009d 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -1,10 +1,9 @@ """ADASYN over sampling algorithm.""" import numpy as np -from scipy import sparse from sklearn.utils import check_random_state -from aeon.transformations.collection.imbalance import SMOTE +from aeon.transformations.collection.imbalance._smote import SMOTE __maintainer__ = ["TonyBagnall"] __all__ = ["ADASYN"] @@ -75,23 +74,13 @@ def _transform(self, X, y=None): cols = random_state.choice(n_neighbors, size=n_samples) diffs = X_class[nns[rows, cols]] - X_class[rows] steps = random_state.uniform(size=(n_samples, 1)) - - if sparse.issparse(X): - sparse_func = type(X).__name__ - steps = getattr(sparse, sparse_func)(steps) - X_new = X_class[rows] + steps.multiply(diffs) - else: - X_new = X_class[rows] + steps * diffs + X_new = X_class[rows] + steps * diffs X_new = X_new.astype(X.dtype) y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype) X_resampled.append(X_new) y_resampled.append(y_new) - - if sparse.issparse(X): - X_resampled = sparse.vstack(X_resampled, format=X.format) - else: - X_resampled = np.vstack(X_resampled) + X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) X_resampled = X_resampled[:, np.newaxis, :] diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index f56e6f7b40..f8b7084e5e 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ 
b/aeon/transformations/collection/imbalance/_smote.py @@ -12,7 +12,6 @@ from collections import OrderedDict import numpy as np -from scipy import sparse from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state @@ -33,10 +32,10 @@ class SMOTE(BaseCollectionTransformer): Parameters ---------- - k_neighbors : int or object, default=5 - The nearest neighbors used to define the neighborhood of samples to use - to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` - instance will be fitted in this case. + k_neighbors : int, default=5 + The number of nearest neighbors used to define the neighborhood of samples + to use to generate the synthetic time series. + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. random_state : int, RandomState instance or None, default=None If `int`, random_state is the seed used by the random number generator; If `RandomState` instance, random_state is the random number generator; @@ -102,11 +101,7 @@ def _transform(self, X, y=None): ) X_resampled.append(X_new) y_resampled.append(y_new) - - if sparse.issparse(X): - X_resampled = sparse.vstack(X_resampled, format=X.format) - else: - X_resampled = np.vstack(X_resampled) + X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) X_resampled = X_resampled[:, np.newaxis, :] return X_resampled, y_resampled @@ -118,8 +113,9 @@ def _make_samples( Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Points from which the points will be created. + X : np.ndarray + Shape (n_cases, n_timepoints), time series from which the new series will + be created. y_dtype : dtype The data type of the targets. 
@@ -222,12 +218,5 @@ def _generate_samples( diffs[mask_pair_samples] *= random_state.uniform( low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1) ) - - if sparse.issparse(X): - sparse_func = type(X).__name__ - steps = getattr(sparse, sparse_func)(steps) - X_new = X[rows] + steps.multiply(diffs) - else: - X_new = X[rows] + steps * diffs - + X_new = X[rows] + steps * diffs return X_new.astype(X.dtype) From 770ea7515b35ac0a7fe104081e69eab6fae9ba50 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 13:09:14 +0000 Subject: [PATCH 9/9] add test parameters --- .../_yield_estimator_checks.py | 5 ++- .../collection/imbalance/_smote.py | 38 +++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/aeon/testing/estimator_checking/_yield_estimator_checks.py b/aeon/testing/estimator_checking/_yield_estimator_checks.py index 70f714d4d9..b90e15df68 100644 --- a/aeon/testing/estimator_checking/_yield_estimator_checks.py +++ b/aeon/testing/estimator_checking/_yield_estimator_checks.py @@ -637,7 +637,10 @@ def check_persistence_via_pickle(estimator, datatype): def check_fit_deterministic(estimator, datatype): """Test that fit is deterministic. - Check that calling fit twice is equivalent to calling it once. + Check that calling fit twice is equivalent to calling it once, in terms of the + output of non-state changing methods such as predict and transform. Calls + fit, then calls all non-state changing methods, then calls fit and non-state + changing methods again, checking the output is the same. 
""" estimator = _clone_estimator(estimator, random_state=0) _run_estimator_method(estimator, "fit", datatype, "train") diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index f8b7084e5e..ee00c78174 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -54,8 +54,6 @@ class SMOTE(BaseCollectionTransformer): """ _tags = { - "capability:multivariate": False, - "capability:unequal_length": False, "requires_y": True, } @@ -143,11 +141,11 @@ def _make_samples( Returns ------- - X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features) - Synthetically generated samples. + X_new : ndarray + Synthetically generated samples of shape (n_samples_new, n_timepoints). - y_new : ndarray of shape (n_samples_new,) - Target values for synthetic samples. + y_new : ndarray + Target values for synthetic samples of shape (n_samples_new,). """ random_state = check_random_state(self.random_state) samples_indices = random_state.randint(low=0, high=nn_num.size, size=n_samples) @@ -178,8 +176,9 @@ def _generate_samples( Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Points from which the points will be created. + X : np.ndarray + Series from which the points will be created of shape (n_cases, + n_timepoints). nn_data : ndarray of shape (n_samples_all, n_features) Data set carrying all the neighbours to be used. @@ -220,3 +219,26 @@ def _generate_samples( ) X_new = X[rows] + steps * diffs return X_new.astype(X.dtype) + + @classmethod + def _get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. 
+ SMOTE defines no special sets: the same parameters are returned for + every value of `parameter_set`, including the "results_comparison" set + that some classifiers use to compare against previously generated + results. + + Returns + ------- + params : dict or list of dict, default={} + Parameters to create testing instances of the class. + Each dict has parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + """ + return {"k_neighbors": 1}