From 5f927ab1107a0eca75157f6d1660dd2b865380e3 Mon Sep 17 00:00:00 2001 From: francesco Date: Mon, 16 Sep 2024 15:36:22 +0200 Subject: [PATCH 01/19] Add BORF to __init__.py --- aeon/transformations/collection/dictionary_based/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aeon/transformations/collection/dictionary_based/__init__.py b/aeon/transformations/collection/dictionary_based/__init__.py index 598f90733a..5f75cc9658 100644 --- a/aeon/transformations/collection/dictionary_based/__init__.py +++ b/aeon/transformations/collection/dictionary_based/__init__.py @@ -1,7 +1,8 @@ """Transformers.""" -__all__ = ["PAA", "SFA", "SFAFast", "SAX"] +__all__ = ["PAA", "SFA", "SFAFast", "SAX", "BORF"] +from aeon.transformations.collection.dictionary_based._borf import BORF from aeon.transformations.collection.dictionary_based._paa import PAA from aeon.transformations.collection.dictionary_based._sax import SAX from aeon.transformations.collection.dictionary_based._sfa import SFA From 19bfe31aedc1db51d9442c9c956efacb01232953 Mon Sep 17 00:00:00 2001 From: francesco Date: Mon, 16 Sep 2024 15:36:43 +0200 Subject: [PATCH 02/19] Add sparse dependency to pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b3f94a196e..88281d0ada 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,6 +82,7 @@ all_extras = [ "xarray", "pyod>=1.1.3", "prts>=1.0.0.0", + "sparse", # for some reason, a heirarchical dask test fails on some Python versions with the # later versions of pyarrow and dask From 40bbb8f7aaed7828072f8aca58444d264cef3fa7 Mon Sep 17 00:00:00 2001 From: francesco Date: Mon, 16 Sep 2024 15:36:51 +0200 Subject: [PATCH 03/19] Add BORF transformer --- .../collection/dictionary_based/_borf.py | 1145 +++++++++++++++++ 1 file changed, 1145 insertions(+) create mode 100644 aeon/transformations/collection/dictionary_based/_borf.py diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py new file mode 100644 index 0000000000..20a716cc89 --- /dev/null +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -0,0 +1,1145 @@ +"""BORF transformer. + +multivariate dictionary based transformer based on Bag-Of-Receptive-Fields transform. +""" + +__maintainer__ = [] +__all__ = ["BORF"] + +import itertools +import math +from collections.abc import Sequence +from typing import Literal, Optional + +import numba as nb +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.pipeline import FeatureUnion +from sklearn.pipeline import make_pipeline as make_pipeline_sklearn + +from aeon.transformations.collection import BaseCollectionTransformer + + +class BORF(BaseCollectionTransformer): + """ + Bag-of-Receptive-Fields (BORF) Transformer. + + Transforms time series data into a bag-of-receptive-fields representation [1] using + SAX. This transformer extracts SAX words from time series by applying sliding + windows, discretizing, and counting the occurrences of the SAX words in the + time series. The output is a sparse feature representation suitable for + various downstream tasks such as classification and regression. + + Parameters + ---------- + window_size_min : int, optional (default=4) + The minimum window size for the sliding window. + window_size_max : int or None, optional (default=None) + The maximum window size for the sliding window. 
If None, it is set + via the heuristic function, based on the time series length. + word_lengths : int, optional (default=4) + The length of the words used in the SAX representation. + alphabets_min : int, optional (default=3) + The minimum number of symbols in the alphabet. + alphabets_max : int, optional (default=4) + The maximum number of symbols in the alphabet. + alphabets_step : int, optional (default=1) + The step size when iterating over the number of symbols in the alphabet. + dilations_min : int, optional (default=1) + The minimum dilation factor. + dilations_max : int or None, optional (default=None) + The maximum dilation factor. If None, it is set via the heuristic function, + based on the time series length. + min_window_to_signal_std_ratio : float, optional (default=0.0) + Minimum ratio of the window size to the standard deviation of the signal. + If the window stardard deviation is lower than this ratio, + the window is considered "flat" and set to all zeros. + This avoids blowing up noise. + n_jobs : int, optional (default=1) + The number of IndividualBORF to run in parallel. + n_jobs_numba : int, optional (default=1) + The number of threads to used for parallelizing + each configurations inside each IndividualBORF. + transformer_weights : array-like or None, optional (default=None) + Weights applied to each transformer in the pipeline. + complexity : {'quadratic', 'linear'}, optional (default='quadratic') + The computational complexity mode. 'quadratic' for higher accuracy + with more computational cost, 'linear' for faster computations. + + Attributes + ---------- + pipe_ : sklearn.pipeline.Pipeline + The internal pipeline used for transforming the data. + configs_ : dict + Configuration parameters used in the pipeline. + + References + ---------- + .. [1] Spinnato, F., Guidotti, R., Monreale, A., & Nanni, M. (2023). + A Bag of Receptive Fields for Time Series Extrinsic Predictions. + arXiv preprint arXiv:2311.18029. 
+ + Examples + -------- + >>> from aeon.transformations.collection.dictionary_based import BORF + >>> from aeon.datasets import load_unit_test + >>> X, _ = load_unit_test(return_X_y=True) + >>> borf = BORF() # doctest: +SKIP + >>> borf.fit(X) # doctest: +SKIP + BORF() + >>> X_transformed = borf.transform(X) # doctest: +SKIP + + """ + + _tags = { + "X_inner_type": "numpy3D", + "capability:inverse_transform": False, + "capability:missing_values:removes": True, + "capability:multivariate": True, + "capability:multithreading": True, + "input_data_type": "Collection", + "algorithm_type": "dictionary", + "output_data_type": "Tabular", + "instancewise": False, + "requires_y": False, + "python_dependencies": "sparse", + } + + def __init__( + self, + window_size_min_window_size=4, + window_size_max_window_size=None, + word_lengths_n_word_lengths=4, + alphabets_min_symbols=3, + alphabets_max_symbols=4, + alphabets_step=1, + dilations_min_dilation=1, + dilations_max_dilation=None, + min_window_to_signal_std_ratio: float = 0.0, + n_jobs=1, + n_jobs_numba=1, + transformer_weights=None, + complexity: Literal["quadratic", "linear"] = "quadratic", + ): + self.window_size_min_window_size = window_size_min_window_size + self.window_size_max_window_size = window_size_max_window_size + self.word_lengths_n_word_lengths = word_lengths_n_word_lengths + self.alphabets_min_symbols = alphabets_min_symbols + self.alphabets_max_symbols = alphabets_max_symbols + self.alphabets_step = alphabets_step + self.dilations_min_dilation = dilations_min_dilation + self.dilations_max_dilation = dilations_max_dilation + self.min_window_to_signal_std_ratio = min_window_to_signal_std_ratio + self.n_jobs = n_jobs + self.n_jobs_numba = n_jobs_numba + self.transformer_weights = transformer_weights + self.complexity = complexity + super().__init__() + + def _fit(self, X, y=None): + time_series_length = X.shape[2] + # for better computation time, this should be moved to the init, + # setting time_series_length as a user parameter + self.pipe_, self.configs_ = build_pipeline_auto( + time_series_min_length=time_series_length, + time_series_max_length=time_series_length, + window_size_min_window_size=self.window_size_min_window_size, + window_size_max_window_size=self.window_size_max_window_size, + word_lengths_n_word_lengths=self.word_lengths_n_word_lengths, + alphabets_min_symbols=self.alphabets_min_symbols, + alphabets_max_symbols=self.alphabets_max_symbols, + alphabets_step=self.alphabets_step, + dilations_min_dilation=self.dilations_min_dilation, + dilations_max_dilation=self.dilations_max_dilation, + min_window_to_signal_std_ratio=self.min_window_to_signal_std_ratio, + n_jobs=self.n_jobs, + n_jobs_numba=self.n_jobs_numba, + transformer_weights=self.transformer_weights, + pipeline_objects=[ + (ReshapeTo2D, dict()), + (ZeroColumnsRemover, dict()), + (ToScipySparse, dict()), + ], + complexity=self.complexity, + ) + self.pipe_.fit(X) + return self + + def _transform(self, X, y=None): + return self.pipe_.transform(X) + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return the "default" set. + + Returns + ------- + dict or list of dict + Parameters to create testing instances of the class. 
+ Each dict contains parameters to construct an "interesting" test instance, + i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid test + instance. `create_test_instance` uses the first (or only) dictionary in + `params`. + """ + params = { + "window_size_min_window_size": 4, + "window_size_max_window_size": None, + "word_lengths_n_word_lengths": 4, + "alphabets_min_symbols": 3, + "alphabets_max_symbols": 4, + "alphabets_step": 1, + "dilations_min_dilation": 1, + "dilations_max_dilation": None, + "min_window_to_signal_std_ratio": 0.0, + "n_jobs": 1, + "n_jobs_numba": 1, + "transformer_weights": None, + "complexity": "quadratic", + } + return params + + +def build_pipeline_auto( + time_series_min_length: int, + time_series_max_length: int, + window_size_min_window_size=4, + window_size_max_window_size=None, + word_lengths_n_word_lengths=4, + alphabets_min_symbols=3, + alphabets_max_symbols=4, + alphabets_step=1, + dilations_min_dilation=1, + dilations_max_dilation=None, + min_window_to_signal_std_ratio: float = 0.0, + n_jobs=1, + n_jobs_numba=1, + transformer_weights=None, + pipeline_objects: Optional[Sequence[tuple]] = None, + complexity: Literal["quadratic", "linear"] = "quadratic", +): + configs = heuristic_function_sax( + time_series_min_length=time_series_min_length, + time_series_max_length=time_series_max_length, + window_size_min_window_size=window_size_min_window_size, + window_size_max_window_size=window_size_max_window_size, + word_lengths_n_word_lengths=word_lengths_n_word_lengths, + alphabets_min_symbols=alphabets_min_symbols, + alphabets_max_symbols=alphabets_max_symbols, + alphabets_step=alphabets_step, + dilations_min_dilation=dilations_min_dilation, + dilations_max_dilation=dilations_max_dilation, + complexity=complexity, + ) + + return ( + build_pipeline( + configs=configs, + min_window_to_signal_std_ratio=min_window_to_signal_std_ratio, + n_jobs=n_jobs, + n_jobs_numba=n_jobs_numba, + transformer_weights=transformer_weights, + pipeline_objects=pipeline_objects, + ), + configs, + ) + + +def build_pipeline( + configs, + min_window_to_signal_std_ratio: float = 0.0, + n_jobs_numba=1, + n_jobs=1, + transformer_weights=None, + pipeline_objects: Optional[Sequence[tuple]] = None, +): + transformers = list() + if pipeline_objects is None: + pipeline_objects = list() + for config in configs: + # alphabet_size, window_size, word_length, dilation, stride + borf = IndividualBORF( + **config, + min_window_to_signal_std_ratio=min_window_to_signal_std_ratio, + n_jobs=n_jobs_numba, + ) + transformer = make_pipeline_sklearn( + borf, *[obj(**kwargs) for obj, kwargs in pipeline_objects] + ) + transformers.append(transformer) + union = FeatureUnion( + transformer_list=[(str(i), transformers[i]) for i in range(len(transformers))], + n_jobs=n_jobs, + transformer_weights=transformer_weights, + ) + return union + + +class IndividualBORF(BaseEstimator, TransformerMixin): + def __init__( + self, + window_size=4, + dilation=1, + alphabet_size: int = 3, + word_length: int = 2, + stride: int = 1, + min_window_to_signal_std_ratio: float = 0.0, + n_jobs: int = 1, + prefix="", + ): + self.window_size = window_size + self.dilation = dilation + self.word_length = word_length + self.stride = stride + self.alphabet_size = alphabet_size + self.min_window_to_signal_std_ratio = min_window_to_signal_std_ratio + self.prefix = prefix + self.n_jobs = n_jobs + self.n_words = convert_to_base_10( + array_to_int(np.full(self.word_length, self.alphabet_size - 1)) + 1, + base=self.alphabet_size, + ) + 
set_n_jobs_numba(n_jobs=self.n_jobs) + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + import sparse # FIXME: can we move this outside for better performance? + + shape_ = (len(X), len(X[0]), self.n_words) + out = transform_sax_patterns( + panel=X, + window_size=self.window_size, + dilation=self.dilation, + alphabet_size=self.alphabet_size, + word_length=self.word_length, + stride=self.stride, + min_window_to_signal_std_ratio=self.min_window_to_signal_std_ratio, + ) + # ts_idx, signal_idx, words, count + return sparse.COO(coords=out[:, :3].T, data=out[:, -1].T, shape=shape_) + + +class ZeroColumnsRemover(BaseEstimator, TransformerMixin): + def __init__(self, axis=0): + self.axis = axis + + self.n_original_columns_ = None + self.columns_to_keep_ = None + + def fit(self, X, y=None): + self.n_original_columns_ = X.shape[1] + self.columns_to_keep_ = np.argwhere(X.any(axis=self.axis)).ravel() + return self + + def transform(self, X): + return X[..., self.columns_to_keep_] + + +class ReshapeTo2D(BaseEstimator, TransformerMixin): + def __init__(self, keep_unraveled_index=False): + self.keep_unraveled_index = keep_unraveled_index + # shape: (n_flattened_features, 2) -> flattened index -> (dimension, word) + self.unraveled_index_ = None + self.original_shape_ = None + + def fit(self, X, y=None): + self.original_shape_ = X.shape + if self.keep_unraveled_index: + self.unraveled_index_ = np.hstack( + [np.unravel_index(np.arange(np.prod(X.shape[1:])), X.shape[1:])] + ).T + return self + + def transform(self, X): + return X.reshape((X.shape[0], -1)) + + +class ToScipySparse(BaseEstimator, TransformerMixin): + def __init__(self): + pass + + def fit(self, X, y=None): + return self + + def transform(self, X): + return X.to_scipy_sparse() + + +@nb.njit(fastmath=True, cache=True) +def convert_to_base_10(number, base): + result = 0 + multiplier = 1 + + while number > 0: + digit = number % 10 + result += digit * multiplier + multiplier *= base + number //= 10 + + return result + + +@nb.njit(cache=True) +def array_to_int(arr): + result = 0 + for i in range(len(arr)): + result = result * 10 + arr[i] + return result + + +def set_n_jobs_numba(n_jobs): + if n_jobs == -1: + nb.set_num_threads(nb.config.NUMBA_DEFAULT_NUM_THREADS) + else: + nb.set_num_threads(n_jobs) + + +@nb.njit(parallel=True, nogil=True, cache=True) +def transform_sax_patterns( + panel, + window_size, + word_length, + alphabet_size, + stride, + dilation, + min_window_to_signal_std_ratio=0.0, +): + bins = get_norm_bins(alphabet_size=alphabet_size) + n_signals = len(panel[0]) + n_ts = len(panel) + iterations = n_ts * n_signals + counts = np.zeros(iterations + 1, dtype=np.int64) + for i in nb.prange(iterations): + ts_idx, signal_idx = ndindex_2d_array(i, n_signals) + signal = np.asarray(panel[ts_idx][signal_idx]) + signal = signal[~np.isnan(signal)] + if not are_window_size_and_dilation_compatible_with_signal_length( + window_size, dilation, signal.size + ): + continue + counts[i + 1] = len( + new_transform_single_conf( + a=signal, + ts_idx=ts_idx, + signal_idx=signal_idx, + window_size=window_size, + word_length=word_length, + alphabet_size=alphabet_size, + bins=bins, + dilation=dilation, + stride=stride, + min_window_to_signal_std_ratio=min_window_to_signal_std_ratio, + ) + ) + cum_counts = np.cumsum(counts) + n_rows = np.sum(counts) + shape = (n_rows, 4) + out = np.empty(shape, dtype=np.int64) + for i in nb.prange(iterations): + ts_idx, signal_idx = ndindex_2d_array(i, n_signals) + signal = 
np.asarray(panel[ts_idx][signal_idx]) + signal = signal[~np.isnan(signal)] + if not are_window_size_and_dilation_compatible_with_signal_length( + window_size, dilation, signal.size + ): + continue + out_ = new_transform_single_conf( + a=signal, + ts_idx=ts_idx, + signal_idx=signal_idx, + window_size=window_size, + word_length=word_length, + alphabet_size=alphabet_size, + bins=bins, + dilation=dilation, + stride=stride, + min_window_to_signal_std_ratio=min_window_to_signal_std_ratio, + ) + out[cum_counts[i] : cum_counts[i + 1], :] = out_ + return out + + +@nb.njit(cache=True) +def new_transform_single_conf( + a, + ts_idx, + signal_idx, + window_size, + word_length, + alphabet_size, + bins, + dilation, + stride=1, + min_window_to_signal_std_ratio=0.0, +): + words, counts = new_transform_single( + a=a, + window_size=window_size, + word_length=word_length, + alphabet_size=alphabet_size, + bins=bins, + dilation=dilation, + stride=stride, + min_window_to_signal_std_ratio=min_window_to_signal_std_ratio, + ) + ts_idxs = np.full(len(words), ts_idx) + signal_idxs = np.full(len(words), signal_idx) + return np.column_stack((ts_idxs, signal_idxs, words, counts)) + + +@nb.njit(cache=True) +def new_transform_single( + a, + window_size, + word_length, + alphabet_size, + bins, + dilation, + stride=1, + min_window_to_signal_std_ratio=0.0, +): + sax_words = sax( + a=a, + window_size=window_size, + word_length=word_length, + bins=bins, + min_window_to_signal_std_ratio=min_window_to_signal_std_ratio, + dilation=dilation, + stride=stride, + ) + sax_words = sax_words_to_int(sax_words, alphabet_size) + return unique(sax_words) + + +@nb.njit(cache=True) +def ndindex_2d_array(idx, dim2_shape): + row_idx = idx // dim2_shape + col_idx = idx % dim2_shape + return row_idx, col_idx + + +@nb.njit(cache=True) +def get_norm_bins(alphabet_size: int, mu=0, std=1): + return ppf(np.linspace(0, 1, alphabet_size + 1)[1:-1], mu, std) + + +@nb.njit(fastmath=True, cache=True) +def erfinv(x: float) -> float: + w = -math.log((1 - x) * (1 + x)) + if w < 5: + w = w - 2.5 + p = 2.81022636e-08 + p = 3.43273939e-07 + p * w + p = -3.5233877e-06 + p * w + p = -4.39150654e-06 + p * w + p = 0.00021858087 + p * w + p = -0.00125372503 + p * w + p = -0.00417768164 + p * w + p = 0.246640727 + p * w + p = 1.50140941 + p * w + else: + w = math.sqrt(w) - 3 + p = -0.000200214257 + p = 0.000100950558 + p * w + p = 0.00134934322 + p * w + p = -0.00367342844 + p * w + p = 0.00573950773 + p * w + p = -0.0076224613 + p * w + p = 0.00943887047 + p * w + p = 1.00167406 + p * w + p = 2.83297682 + p * w + return p * x + + +@nb.vectorize(cache=True) +def ppf(x, mu=0, std=1): + return mu + math.sqrt(2) * erfinv(2 * x - 1) * std + + +@nb.njit(fastmath=True, cache=True) +def are_window_size_and_dilation_compatible_with_signal_length( + window_size, dilation, signal_length +): + if window_size + (window_size - 1) * (dilation - 1) <= signal_length: + return True + else: + return False + + +@nb.njit(cache=True) +def sax( + a, + window_size, + word_length, + bins, + stride=1, + dilation=1, + min_window_to_signal_std_ratio=0.0, +): + n_windows = get_n_windows( + sequence_size=a.size, window_size=window_size, dilation=dilation, stride=stride + ) + n_windows_moving = get_n_windows( + sequence_size=a.size, window_size=window_size, dilation=dilation + ) + global_std = np.std(a) + if global_std == 0: + return np.zeros((n_windows, word_length), dtype=np.uint8) + seg_size = window_size // word_length + n_windows = get_n_windows( + sequence_size=a.size, 
window_size=window_size, dilation=dilation, stride=stride + ) + n_segments = get_n_windows( + sequence_size=a.size, window_size=seg_size, dilation=dilation + ) + segment_means = np.full(n_segments, np.nan) + window_means = np.full(n_windows_moving, np.nan) + window_stds = np.full(n_windows_moving, np.nan) + for d in range(dilation): + window_means[d::dilation] = move_mean(a[d::dilation], window_size)[ + window_size - 1 : + ] + window_stds[d::dilation] = move_std(a[d::dilation], window_size)[ + window_size - 1 : + ] + segment_means[d::dilation] = move_mean(a[d::dilation], seg_size)[seg_size - 1 :] + out = np.zeros((n_windows, word_length)) + for i in range(n_windows): + for j in range(word_length): + out[i, j] = zscore_threshold( + a=segment_means[(i * stride) + (j * seg_size * dilation)], + mu=window_means[i * stride], + sigma=window_stds[i * stride], + sigma_global=global_std, + sigma_threshold=min_window_to_signal_std_ratio, + ) + return np.digitize(out, bins).astype(np.uint8) + + +@nb.njit(fastmath=True, cache=True) +def get_n_windows(sequence_size, window_size, dilation=1, stride=1, padding=0): + return 1 + math.floor( + (sequence_size + 2 * padding - window_size - (dilation - 1) * (window_size - 1)) + / stride + ) + + +@nb.njit(fastmath=True, cache=True) +def zscore_threshold( + a: float, mu: float, sigma: float, sigma_global: float, sigma_threshold: float +) -> float: + if sigma_global == 0: + return 0 + if sigma / sigma_global < sigma_threshold: + return 0 + return zscore(a=a, mu=mu, sigma=sigma) + + +@nb.njit(fastmath=True, cache=True) +def zscore(a: float, mu: float, sigma: float) -> float: + if sigma == 0: + return 0 + return (a - mu) / sigma + + +@nb.njit(fastmath=True, cache=True) +def move_mean(a, window_width): + out = np.empty_like(a) + asum = 0.0 + count = 0 + + # Calculating the initial moving window sum + for i in range(window_width): + asum += a[i] + count += 1 + out[i] = asum / count + + # Moving window + for i in range(window_width, len(a)): + asum += a[i] - a[i - window_width] + out[i] = asum / window_width + return out + + +@nb.njit(fastmath=True, cache=True) +def move_std(a, window_width, ddof=0): + out = np.empty_like(a, dtype=np.float64) + + # Initial mean and variance calculation + mean = 0.0 + M2 = 0.0 + for i in range(window_width): + delta = a[i] - mean + mean += delta / (i + 1) + delta2 = a[i] - mean + M2 += delta * delta2 + + # Adjusting for degrees of freedom + if window_width - ddof > 0: + variance = M2 / (window_width - ddof) + else: + variance = 0 # Avoid division by zero + + out[window_width - 1] = np.sqrt(max(variance, 0)) + + # Moving window + for i in range(window_width, len(a)): + x0 = a[i - window_width] + xn = a[i] + + new_avg = mean + (xn - x0) / window_width + + # Update the variance using the new degrees of freedom + if window_width - ddof > 0: + new_var = variance + (xn - new_avg + x0 - mean) * (xn - x0) / ( + window_width - ddof + ) + else: + new_var = 0 # Avoid division by zero + + out[i] = np.sqrt(max(new_var, 0)) # TODO: investigate negative variance + + mean = new_avg + variance = new_var + + out[: window_width - 1] = np.nan + + return out + + +@nb.njit(fastmath=True, cache=True) +def sax_words_to_int(arrays, base): + out = np.empty(arrays.shape[0], dtype=np.int64) + for i in range(arrays.shape[0]): + out[i] = array_to_int_new_base(arrays[i], base) + return out + + +@nb.njit(fastmath=True, cache=True) +def array_to_int_new_base(array, base): + word_length = array.shape[0] + result = 0 + for i in range(0, word_length, 1): + result += 
array[i] * base ** (word_length - i - 1) + return result + + +@nb.njit(cache=True) +def length(a): + a = int(np.ceil(np.log2(a))) + a = 2 << a + return a + + +@nb.njit(cache=True) +def hash_function(v): + + byte_mask = np.uint64(255) + bs = np.uint64(v) + x1 = (bs) & byte_mask + x2 = (bs >> 8) & byte_mask + x3 = (bs >> 16) & byte_mask + x4 = (bs >> 24) & byte_mask + x5 = (bs >> 32) & byte_mask + x6 = (bs >> 40) & byte_mask + x7 = (bs >> 48) & byte_mask + x8 = (bs >> 56) & byte_mask + + FNV_primer = np.uint64(1099511628211) + FNV_bias = np.uint64(14695981039346656037) + h = FNV_bias + h = h * FNV_primer + h = h ^ x1 + h = h * FNV_primer + h = h ^ x2 + h = h * FNV_primer + h = h ^ x3 + h = h * FNV_primer + h = h ^ x4 + h = h * FNV_primer + h = h ^ x5 + h = h * FNV_primer + h = h ^ x6 + h = h * FNV_primer + h = h ^ x7 + h = h * FNV_primer + h = h ^ x8 + return h + + +@nb.njit(cache=True) +def make_hash_table(ar): + a = length(len(ar)) + mask = a - 1 + + uniques = np.empty(a, dtype=ar.dtype) + uniques_cnt = np.zeros(a, dtype=np.int_) + return uniques, uniques_cnt, a, mask + + +@nb.njit(cache=True) +def set_item(uniques, uniques_cnt, mask, h, v, total, miss_hits, weight): + index = h & mask + while True: + if uniques_cnt[index] == 0: + # insert new + uniques_cnt[index] += weight + uniques[index] = v + total += 1 + break + elif uniques[index] == v: + uniques_cnt[index] += weight + break + else: + miss_hits += 1 + index += 1 + index = index & mask + return total, miss_hits + + +@nb.njit(cache=True) +def concrete(ar, uniques, uniques_cnt, a, total): + # flush the results in a concrete array + uniques_ = np.empty(total, dtype=ar.dtype) + uniques_cnt_ = np.empty(total, dtype=np.int_) + t = 0 + for i in range(a): + if uniques_cnt[i] > 0: + uniques_[t] = uniques[i] + uniques_cnt_[t] = uniques_cnt[i] + t += 1 + return uniques_, uniques_cnt_ + + +@nb.njit(cache=True) +def unique(ar): + # https://github.com/lhprojects/blog/blob/master/_posts/JupyterNotebooks/HashUnique.ipynb + uniques, uniques_cnt, l, mask = make_hash_table(ar) + total = 0 + miss_hits = 0 + for v in ar: + h = hash_function(v) + total, miss_hits = set_item( + uniques, uniques_cnt, mask, h, v, total, miss_hits, 1 + ) + uniques_, uniques_cnt_ = concrete(ar, uniques, uniques_cnt, l, total) + return uniques_, uniques_cnt_ + + +def get_borf_params( + time_series_min_length, + time_series_max_length, + window_size_min_window_size=4, + window_size_max_window_size=None, + window_size_power=2, + word_lengths_n_word_lengths=4, + strides_n_strides=1, + alphabets_mean_min_symbols=2, + alphabets_mean_max_symbols=3, + alphabets_mean_step=1, + alphabets_slope_min_symbols=3, + alphabets_slope_max_symbols=4, + alphabets_slope_step=1, + dilations_min_dilation=1, + dilations_max_dilation=None, +): + params = {} + params["window_sizes"] = get_window_sizes( + m_min=time_series_min_length, + m_max=time_series_max_length, + min_window_size=window_size_min_window_size, + max_window_size=window_size_max_window_size, + power=window_size_power, + ).tolist() + + params["word_lengths"] = get_word_lengths( + n_word_lengths=word_lengths_n_word_lengths, + start=0, + ).tolist() + + params["dilations"] = get_dilations( + max_length=time_series_max_length, + min_dilation=dilations_min_dilation, + max_dilation=dilations_max_dilation, + ).tolist() + params["strides"] = get_strides(n_strides=strides_n_strides).tolist() + params["alphabet_sizes_slope"] = get_alphabet_sizes( + min_symbols=alphabets_slope_min_symbols, + max_symbols=alphabets_slope_max_symbols, + 
step=alphabets_slope_step, + ).tolist() + params["alphabet_sizes_mean"] = get_alphabet_sizes( + min_symbols=alphabets_mean_min_symbols, + max_symbols=alphabets_mean_max_symbols, + step=alphabets_mean_step, + ).tolist() + return params + + +@nb.njit(cache=True) +def is_empty(a) -> bool: + return a.size == 0 + + +def get_window_sizes(m_min, m_max, min_window_size=4, max_window_size=None, power=2): + if max_window_size is None: + max_window_size = m_max + m = 2 + windows = list() + windows_min = list() + while m <= max_window_size: + if m < min_window_size: + windows_min.append(m) + else: + windows.append(m) + m = int(m * power) + windows = np.array(windows) + windows_min = np.array(windows_min[1:]) + if not is_empty(windows_min): + if m_min <= windows_min.max() * power: + windows = np.concatenate([windows_min, windows]) + return windows.astype(int) + + +def get_word_lengths(n_word_lengths=4, start=0): + return np.array([2**i for i in range(start, n_word_lengths + start)]) + + +def get_dilations(max_length, min_dilation=1, max_dilation=None): + dilations = list() + max_length_log2 = np.log2(max_length) + if max_dilation is None: + max_dilation = max_length_log2 + start = min_dilation + while start <= max_dilation: + dilations.append(start) + start *= 2 + return np.array(dilations) + + +def get_strides(n_strides=1): + return np.arange(1, n_strides + 1) + + +def get_alphabet_sizes(min_symbols=3, max_symbols=4, step=1): + return np.arange(min_symbols, max_symbols, step) + + +def generate_sax_parameters_configurations( + window_sizes, + strides, + dilations, + word_lengths, + alphabet_sizes_mean, + alphabet_sizes_slope, +): + parameters = list( + itertools.product( + *[ + window_sizes, + strides, + dilations, + word_lengths, + alphabet_sizes_mean, + alphabet_sizes_slope, + ] + ) + ) + cleaned_parameters = list() + for parameter in parameters: + ( + window_size, + stride, + dilation, + word_length, + alphabet_mean, + alphabet_slope, + ) = extract_parameters_from_args(parameter) + if word_length > window_size: # word_length cannot be greater than window_size + continue + if ( + alphabet_slope <= 1 and word_length == 1 + ): # if alphabet_slope <= 1, word_length=1 is useless + continue + cleaned_parameters.append( + dict( + window_size=window_size, + stride=stride, + dilation=dilation, + word_length=word_length, + alphabet_size_mean=alphabet_mean, + alphabet_size_slope=alphabet_slope, + ) + ) + return cleaned_parameters + + +@nb.njit(cache=True) +def is_valid_windowing(sequence_size: int, window_size: int, dilation: int) -> bool: + if ( + sequence_size < window_size * dilation + ): # if window_size * dilation exceeds the length of the sequence + return False + else: + return True + + +def clean_sax_parameters_configurations(parameters, max_length): + cleaned_parameters = list() + for parameter in parameters: + window_size = parameter["window_size"] + dilation = parameter["dilation"] + word_length = parameter["word_length"] + alphabet_mean = parameter["alphabet_size_mean"] + alphabet_slope = parameter["alphabet_size_slope"] + stride = parameter["stride"] + if not is_valid_windowing( + window_size=window_size, sequence_size=max_length, dilation=dilation + ): + continue + # if window_size == word_length: + # continue # FIXME: should I do this? 
+ cleaned_parameters.append( + dict( + window_size=window_size, + stride=stride, + dilation=dilation, + word_length=word_length, + alphabet_size_mean=alphabet_mean, + alphabet_size_slope=alphabet_slope, + ) + ) + return cleaned_parameters + + +def sax_parameters_configurations_linear_strides(parameters): + new_parameters = list() + for parameter in parameters: + window_size = parameter["window_size"] + dilation = parameter["dilation"] + word_length = parameter["word_length"] + alphabet_mean = parameter["alphabet_size_mean"] + alphabet_slope = parameter["alphabet_size_slope"] + parameter = dict( + window_size=window_size, + stride=word_length, + dilation=dilation, + word_length=word_length, + alphabet_size_mean=alphabet_mean, + alphabet_size_slope=alphabet_slope, + ) + new_parameters.append(parameter) + return new_parameters + + +def extract_parameters_from_args(parameter): + window_size = parameter[0] + stride = parameter[1] + dilation = parameter[2] + word_length = parameter[3] + alphabet_mean = parameter[4] + alphabet_slope = parameter[5] + return window_size, stride, dilation, word_length, alphabet_mean, alphabet_slope + + +def heuristic_function_1dsax( + time_series_min_length, + time_series_max_length, + window_size_min_window_size=4, + window_size_max_window_size=None, + window_size_power=2, + word_lengths_n_word_lengths=3, + strides_n_strides=1, + alphabets_mean_min_symbols=2, + alphabets_mean_max_symbols=3, + alphabets_mean_step=1, + alphabets_slope_min_symbols=2, + alphabets_slope_max_symbols=3, + alphabets_slope_step=1, + dilations_min_dilation=1, + dilations_max_dilation=None, + complexity: Literal["quadratic", "linear"] = "quadratic", +): + params = get_borf_params( + time_series_min_length=time_series_min_length, + time_series_max_length=time_series_max_length, + window_size_min_window_size=window_size_min_window_size, + window_size_max_window_size=window_size_max_window_size, + window_size_power=window_size_power, + word_lengths_n_word_lengths=word_lengths_n_word_lengths, + strides_n_strides=strides_n_strides, + alphabets_slope_min_symbols=alphabets_slope_min_symbols, + alphabets_slope_max_symbols=alphabets_slope_max_symbols, + alphabets_slope_step=alphabets_slope_step, + alphabets_mean_min_symbols=alphabets_mean_min_symbols, + alphabets_mean_max_symbols=alphabets_mean_max_symbols, + alphabets_mean_step=alphabets_mean_step, + dilations_min_dilation=dilations_min_dilation, + dilations_max_dilation=dilations_max_dilation, + ) + + params_list = generate_sax_parameters_configurations( + window_sizes=params["window_sizes"], + strides=params["strides"], + dilations=params["dilations"], + word_lengths=params["word_lengths"], + alphabet_sizes_mean=params["alphabet_sizes_mean"], + alphabet_sizes_slope=params["alphabet_sizes_slope"], + ) + + cleaned_params_list = clean_sax_parameters_configurations( + parameters=params_list, max_length=time_series_max_length + ) + + if complexity == "linear": + cleaned_params_list = sax_parameters_configurations_linear_strides( + parameters=cleaned_params_list + ) + + return cleaned_params_list + + +def heuristic_function_sax( + time_series_min_length, + time_series_max_length, + window_size_min_window_size=4, + window_size_max_window_size=None, + window_size_power=2, + word_lengths_n_word_lengths=4, + strides_n_strides=1, + alphabets_min_symbols=2, + alphabets_max_symbols=3, + alphabets_step=1, + dilations_min_dilation=1, + dilations_max_dilation=None, + complexity: Literal["quadratic", "linear"] = "quadratic", +): + configs = 
heuristic_function_1dsax( + time_series_min_length=time_series_min_length, + time_series_max_length=time_series_max_length, + window_size_min_window_size=window_size_min_window_size, + window_size_max_window_size=window_size_max_window_size, + window_size_power=window_size_power, + word_lengths_n_word_lengths=word_lengths_n_word_lengths, + strides_n_strides=strides_n_strides, + alphabets_slope_min_symbols=0, + alphabets_slope_max_symbols=1, + alphabets_slope_step=1, + alphabets_mean_min_symbols=alphabets_min_symbols, + alphabets_mean_max_symbols=alphabets_max_symbols, + alphabets_mean_step=alphabets_step, + dilations_min_dilation=dilations_min_dilation, + dilations_max_dilation=dilations_max_dilation, + complexity=complexity, + ) + for config in configs: + config.pop("alphabet_size_slope") + config["alphabet_size"] = config.pop("alphabet_size_mean") + return configs From 1de9768d066febc86d590432511071a002507e23 Mon Sep 17 00:00:00 2001 From: francesco Date: Mon, 16 Sep 2024 15:59:47 +0200 Subject: [PATCH 04/19] Add maintainer --- aeon/transformations/collection/dictionary_based/_borf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py index 20a716cc89..86dafab0a8 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -3,7 +3,7 @@ multivariate dictionary based transformer based on Bag-Of-Receptive-Fields transform. """ -__maintainer__ = [] +__maintainer__ = ["fspinna"] __all__ = ["BORF"] import itertools From 87e32aa8f01f7cc88c7f5e8a1d5aac33aa89d11f Mon Sep 17 00:00:00 2001 From: francesco Date: Mon, 16 Sep 2024 15:59:59 +0200 Subject: [PATCH 05/19] Update .all-contributorsrc --- .all-contributorsrc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.all-contributorsrc b/.all-contributorsrc index 06088d94f9..39c9b816f5 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -2529,6 +2529,16 @@ "contributions": [ "code" ] + }, + , + { + "login": "fspinna", + "name": "Francesco Spinnato", + "avatar_url": "https://avatars.githubusercontent.com/u/35352023?v=4", + "profile": "https://github.com/fspinna", + "contributions": [ + "code" + ] } ], "commitType": "docs" From 0781ca3e86ab0ba71b9e6e70849cb92ae4eed988 Mon Sep 17 00:00:00 2001 From: francesco Date: Mon, 16 Sep 2024 16:00:14 +0200 Subject: [PATCH 06/19] Update transformations.rst --- docs/api_reference/transformations.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api_reference/transformations.rst b/docs/api_reference/transformations.rst index d56e9c6a23..22705145f7 100644 --- a/docs/api_reference/transformations.rst +++ b/docs/api_reference/transformations.rst @@ -119,6 +119,7 @@ Dictionary-based features PAA SFA SAX + BORF Signature-based features ~~~~~~~~~~~~~~~~~~~~~~~~ From 3d33aff75364245631840897369afee84812a01e Mon Sep 17 00:00:00 2001 From: francesco Date: Mon, 16 Sep 2024 16:05:21 +0200 Subject: [PATCH 07/19] Update .all-contributorsrc --- .all-contributorsrc | 1 - 1 file changed, 1 deletion(-) diff --git a/.all-contributorsrc b/.all-contributorsrc index 39c9b816f5..9ea0cf19e2 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -2530,7 +2530,6 @@ "code" ] }, - , { "login": "fspinna", "name": "Francesco Spinnato", From 727828c7b977e996b9718cc04f27bbcddf9e8970 Mon Sep 17 00:00:00 2001 From: francesco Date: Tue, 17 Sep 2024 09:36:12 +0200 Subject: [PATCH 08/19] Fix docstring style 
and variable names --- .../collection/dictionary_based/_borf.py | 47 ++++++++++--------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py index 86dafab0a8..a2500ccc7b 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -32,39 +32,40 @@ class BORF(BaseCollectionTransformer): Parameters ---------- - window_size_min : int, optional (default=4) + window_size_min_window_size : int, default=4 The minimum window size for the sliding window. - window_size_max : int or None, optional (default=None) + window_size_max_window_size : int or None, default=None The maximum window size for the sliding window. If None, it is set - via the heuristic function, based on the time series length. - word_lengths : int, optional (default=4) + via a heuristic function based on the time series length. + word_lengths_n_word_lengths : int, default=4 The length of the words used in the SAX representation. - alphabets_min : int, optional (default=3) + alphabets_min_symbols : int, default=3 The minimum number of symbols in the alphabet. - alphabets_max : int, optional (default=4) + alphabets_max_symbols : int, default=4 The maximum number of symbols in the alphabet. - alphabets_step : int, optional (default=1) + alphabets_step : int, default=1 The step size when iterating over the number of symbols in the alphabet. - dilations_min : int, optional (default=1) + dilations_min_dilation : int, default=1 The minimum dilation factor. - dilations_max : int or None, optional (default=None) - The maximum dilation factor. If None, it is set via the heuristic function, + dilations_max_dilation : int or None, default=None + The maximum dilation factor. If None, it is set via a heuristic function based on the time series length. - min_window_to_signal_std_ratio : float, optional (default=0.0) + min_window_to_signal_std_ratio : float, default=0.0 Minimum ratio of the window size to the standard deviation of the signal. - If the window stardard deviation is lower than this ratio, - the window is considered "flat" and set to all zeros. - This avoids blowing up noise. - n_jobs : int, optional (default=1) - The number of IndividualBORF to run in parallel. - n_jobs_numba : int, optional (default=1) - The number of threads to used for parallelizing - each configurations inside each IndividualBORF. - transformer_weights : array-like or None, optional (default=None) + If the window standard deviation is lower than this ratio, + the window is considered "flat" and set to all zeros to avoid amplifying noise. + n_jobs : int, default=1 + The number of `IndividualBORF` instances to run in parallel. + n_jobs_numba : int, default=1 + The number of threads used for parallelizing each + configuration inside each `IndividualBORF`. + transformer_weights : array-like or None, default=None Weights applied to each transformer in the pipeline. - complexity : {'quadratic', 'linear'}, optional (default='quadratic') - The computational complexity mode. 'quadratic' for higher accuracy - with more computational cost, 'linear' for faster computations. + complexity : {'quadratic', 'linear'}, default='quadratic' + The computational complexity mode: + - `'quadratic'`: Higher accuracy with more computational cost. + - `'linear'`: Faster computations with potentially lower accuracy. 
+ Attributes ---------- From f4b19ad115c8e1d1fc29d10b2193485fa0a6163e Mon Sep 17 00:00:00 2001 From: francesco Date: Sat, 21 Sep 2024 10:57:15 +0200 Subject: [PATCH 09/19] Change BORF reference in Docstring --- aeon/transformations/collection/dictionary_based/_borf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py index a2500ccc7b..206e9f4cd8 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -76,9 +76,10 @@ class BORF(BaseCollectionTransformer): References ---------- - .. [1] Spinnato, F., Guidotti, R., Monreale, A., & Nanni, M. (2023). - A Bag of Receptive Fields for Time Series Extrinsic Predictions. - arXiv preprint arXiv:2311.18029. + .. [1] F. Spinnato, R. Guidotti, A. Monreale and M. Nanni, + "Fast, Interpretable and Deterministic Time Series Classification + with a Bag-Of-Receptive-Fields," in IEEE Access, + doi: 10.1109/ACCESS.2024.3464743 Examples -------- From 245003bdbddc5d1ae720f666843161b495c62a69 Mon Sep 17 00:00:00 2001 From: francesco Date: Tue, 15 Oct 2024 11:02:38 +0200 Subject: [PATCH 10/19] Change public functions to private --- .../collection/dictionary_based/_borf.py | 182 +++++++++--------- 1 file changed, 92 insertions(+), 90 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py index 206e9f4cd8..75832729dc 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -142,7 +142,7 @@ def _fit(self, X, y=None): time_series_length = X.shape[2] # for better computation time, this should be moved to the init, # setting time_series_length as a user parameter - self.pipe_, self.configs_ = build_pipeline_auto( + self.pipe_, self.configs_ = _build_pipeline_auto( time_series_min_length=time_series_length, time_series_max_length=time_series_length, window_size_min_window_size=self.window_size_min_window_size, @@ -207,7 +207,7 @@ def get_test_params(cls, parameter_set="default"): return params -def build_pipeline_auto( +def _build_pipeline_auto( time_series_min_length: int, time_series_max_length: int, window_size_min_window_size=4, @@ -225,7 +225,7 @@ def build_pipeline_auto( pipeline_objects: Optional[Sequence[tuple]] = None, complexity: Literal["quadratic", "linear"] = "quadratic", ): - configs = heuristic_function_sax( + configs = _heuristic_function_sax( time_series_min_length=time_series_min_length, time_series_max_length=time_series_max_length, window_size_min_window_size=window_size_min_window_size, @@ -240,7 +240,7 @@ def build_pipeline_auto( ) return ( - build_pipeline( + _build_pipeline( configs=configs, min_window_to_signal_std_ratio=min_window_to_signal_std_ratio, n_jobs=n_jobs, @@ -252,7 +252,7 @@ def build_pipeline_auto( ) -def build_pipeline( +def _build_pipeline( configs, min_window_to_signal_std_ratio: float = 0.0, n_jobs_numba=1, @@ -302,11 +302,11 @@ def __init__( self.min_window_to_signal_std_ratio = min_window_to_signal_std_ratio self.prefix = prefix self.n_jobs = n_jobs - self.n_words = convert_to_base_10( - array_to_int(np.full(self.word_length, self.alphabet_size - 1)) + 1, + self.n_words = _convert_to_base_10( + _array_to_int(np.full(self.word_length, self.alphabet_size - 1)) + 1, base=self.alphabet_size, ) - set_n_jobs_numba(n_jobs=self.n_jobs) + 
_set_n_jobs_numba(n_jobs=self.n_jobs) def fit(self, X, y=None): return self @@ -315,7 +315,7 @@ def transform(self, X, y=None): import sparse # FIXME: can we move this outside for better performance? shape_ = (len(X), len(X[0]), self.n_words) - out = transform_sax_patterns( + out = _transform_sax_patterns( panel=X, window_size=self.window_size, dilation=self.dilation, @@ -375,7 +375,7 @@ def transform(self, X): @nb.njit(fastmath=True, cache=True) -def convert_to_base_10(number, base): +def _convert_to_base_10(number, base): result = 0 multiplier = 1 @@ -389,14 +389,14 @@ def convert_to_base_10(number, base): @nb.njit(cache=True) -def array_to_int(arr): +def _array_to_int(arr): result = 0 for i in range(len(arr)): result = result * 10 + arr[i] return result -def set_n_jobs_numba(n_jobs): +def _set_n_jobs_numba(n_jobs): if n_jobs == -1: nb.set_num_threads(nb.config.NUMBA_DEFAULT_NUM_THREADS) else: @@ -404,7 +404,7 @@ def set_n_jobs_numba(n_jobs): @nb.njit(parallel=True, nogil=True, cache=True) -def transform_sax_patterns( +def _transform_sax_patterns( panel, window_size, word_length, @@ -413,21 +413,21 @@ def transform_sax_patterns( dilation, min_window_to_signal_std_ratio=0.0, ): - bins = get_norm_bins(alphabet_size=alphabet_size) + bins = _get_norm_bins(alphabet_size=alphabet_size) n_signals = len(panel[0]) n_ts = len(panel) iterations = n_ts * n_signals counts = np.zeros(iterations + 1, dtype=np.int64) for i in nb.prange(iterations): - ts_idx, signal_idx = ndindex_2d_array(i, n_signals) + ts_idx, signal_idx = _ndindex_2d_array(i, n_signals) signal = np.asarray(panel[ts_idx][signal_idx]) signal = signal[~np.isnan(signal)] - if not are_window_size_and_dilation_compatible_with_signal_length( + if not _are_window_size_and_dilation_compatible_with_signal_length( window_size, dilation, signal.size ): continue counts[i + 1] = len( - new_transform_single_conf( + _new_transform_single_conf( a=signal, ts_idx=ts_idx, signal_idx=signal_idx, @@ -445,14 +445,14 @@ def transform_sax_patterns( shape = (n_rows, 4) out = np.empty(shape, dtype=np.int64) for i in nb.prange(iterations): - ts_idx, signal_idx = ndindex_2d_array(i, n_signals) + ts_idx, signal_idx = _ndindex_2d_array(i, n_signals) signal = np.asarray(panel[ts_idx][signal_idx]) signal = signal[~np.isnan(signal)] - if not are_window_size_and_dilation_compatible_with_signal_length( + if not _are_window_size_and_dilation_compatible_with_signal_length( window_size, dilation, signal.size ): continue - out_ = new_transform_single_conf( + out_ = _new_transform_single_conf( a=signal, ts_idx=ts_idx, signal_idx=signal_idx, @@ -469,7 +469,7 @@ def transform_sax_patterns( @nb.njit(cache=True) -def new_transform_single_conf( +def _new_transform_single_conf( a, ts_idx, signal_idx, @@ -481,7 +481,7 @@ def new_transform_single_conf( stride=1, min_window_to_signal_std_ratio=0.0, ): - words, counts = new_transform_single( + words, counts = _new_transform_single( a=a, window_size=window_size, word_length=word_length, @@ -497,7 +497,7 @@ def new_transform_single_conf( @nb.njit(cache=True) -def new_transform_single( +def _new_transform_single( a, window_size, word_length, @@ -507,7 +507,7 @@ def new_transform_single( stride=1, min_window_to_signal_std_ratio=0.0, ): - sax_words = sax( + sax_words = _sax( a=a, window_size=window_size, word_length=word_length, @@ -516,24 +516,24 @@ def new_transform_single( dilation=dilation, stride=stride, ) - sax_words = sax_words_to_int(sax_words, alphabet_size) - return unique(sax_words) + sax_words = _sax_words_to_int(sax_words, 
alphabet_size) + return _unique(sax_words) @nb.njit(cache=True) -def ndindex_2d_array(idx, dim2_shape): +def _ndindex_2d_array(idx, dim2_shape): row_idx = idx // dim2_shape col_idx = idx % dim2_shape return row_idx, col_idx @nb.njit(cache=True) -def get_norm_bins(alphabet_size: int, mu=0, std=1): - return ppf(np.linspace(0, 1, alphabet_size + 1)[1:-1], mu, std) +def _get_norm_bins(alphabet_size: int, mu=0, std=1): + return _ppf(np.linspace(0, 1, alphabet_size + 1)[1:-1], mu, std) @nb.njit(fastmath=True, cache=True) -def erfinv(x: float) -> float: +def _erfinv(x: float) -> float: w = -math.log((1 - x) * (1 + x)) if w < 5: w = w - 2.5 @@ -561,12 +561,12 @@ def erfinv(x: float) -> float: @nb.vectorize(cache=True) -def ppf(x, mu=0, std=1): - return mu + math.sqrt(2) * erfinv(2 * x - 1) * std +def _ppf(x, mu=0, std=1): + return mu + math.sqrt(2) * _erfinv(2 * x - 1) * std @nb.njit(fastmath=True, cache=True) -def are_window_size_and_dilation_compatible_with_signal_length( +def _are_window_size_and_dilation_compatible_with_signal_length( window_size, dilation, signal_length ): if window_size + (window_size - 1) * (dilation - 1) <= signal_length: @@ -576,7 +576,7 @@ def are_window_size_and_dilation_compatible_with_signal_length( @nb.njit(cache=True) -def sax( +def _sax( a, window_size, word_length, @@ -585,37 +585,39 @@ def sax( dilation=1, min_window_to_signal_std_ratio=0.0, ): - n_windows = get_n_windows( + n_windows = _get_n_windows( sequence_size=a.size, window_size=window_size, dilation=dilation, stride=stride ) - n_windows_moving = get_n_windows( + n_windows_moving = _get_n_windows( sequence_size=a.size, window_size=window_size, dilation=dilation ) global_std = np.std(a) if global_std == 0: return np.zeros((n_windows, word_length), dtype=np.uint8) seg_size = window_size // word_length - n_windows = get_n_windows( + n_windows = _get_n_windows( sequence_size=a.size, window_size=window_size, dilation=dilation, stride=stride ) - n_segments = get_n_windows( + n_segments = _get_n_windows( sequence_size=a.size, window_size=seg_size, dilation=dilation ) segment_means = np.full(n_segments, np.nan) window_means = np.full(n_windows_moving, np.nan) window_stds = np.full(n_windows_moving, np.nan) for d in range(dilation): - window_means[d::dilation] = move_mean(a[d::dilation], window_size)[ + window_means[d::dilation] = _move_mean(a[d::dilation], window_size)[ window_size - 1 : ] - window_stds[d::dilation] = move_std(a[d::dilation], window_size)[ + window_stds[d::dilation] = _move_std(a[d::dilation], window_size)[ window_size - 1 : ] - segment_means[d::dilation] = move_mean(a[d::dilation], seg_size)[seg_size - 1 :] + segment_means[d::dilation] = _move_mean(a[d::dilation], seg_size)[ + seg_size - 1 : + ] out = np.zeros((n_windows, word_length)) for i in range(n_windows): for j in range(word_length): - out[i, j] = zscore_threshold( + out[i, j] = _zscore_threshold( a=segment_means[(i * stride) + (j * seg_size * dilation)], mu=window_means[i * stride], sigma=window_stds[i * stride], @@ -626,7 +628,7 @@ def sax( @nb.njit(fastmath=True, cache=True) -def get_n_windows(sequence_size, window_size, dilation=1, stride=1, padding=0): +def _get_n_windows(sequence_size, window_size, dilation=1, stride=1, padding=0): return 1 + math.floor( (sequence_size + 2 * padding - window_size - (dilation - 1) * (window_size - 1)) / stride @@ -634,25 +636,25 @@ def get_n_windows(sequence_size, window_size, dilation=1, stride=1, padding=0): @nb.njit(fastmath=True, cache=True) -def zscore_threshold( +def _zscore_threshold( a: float, 
mu: float, sigma: float, sigma_global: float, sigma_threshold: float ) -> float: if sigma_global == 0: return 0 if sigma / sigma_global < sigma_threshold: return 0 - return zscore(a=a, mu=mu, sigma=sigma) + return _zscore(a=a, mu=mu, sigma=sigma) @nb.njit(fastmath=True, cache=True) -def zscore(a: float, mu: float, sigma: float) -> float: +def _zscore(a: float, mu: float, sigma: float) -> float: if sigma == 0: return 0 return (a - mu) / sigma @nb.njit(fastmath=True, cache=True) -def move_mean(a, window_width): +def _move_mean(a, window_width): out = np.empty_like(a) asum = 0.0 count = 0 @@ -671,7 +673,7 @@ def move_mean(a, window_width): @nb.njit(fastmath=True, cache=True) -def move_std(a, window_width, ddof=0): +def _move_std(a, window_width, ddof=0): out = np.empty_like(a, dtype=np.float64) # Initial mean and variance calculation @@ -717,15 +719,15 @@ def move_std(a, window_width, ddof=0): @nb.njit(fastmath=True, cache=True) -def sax_words_to_int(arrays, base): +def _sax_words_to_int(arrays, base): out = np.empty(arrays.shape[0], dtype=np.int64) for i in range(arrays.shape[0]): - out[i] = array_to_int_new_base(arrays[i], base) + out[i] = _array_to_int_new_base(arrays[i], base) return out @nb.njit(fastmath=True, cache=True) -def array_to_int_new_base(array, base): +def _array_to_int_new_base(array, base): word_length = array.shape[0] result = 0 for i in range(0, word_length, 1): @@ -734,14 +736,14 @@ def array_to_int_new_base(array, base): @nb.njit(cache=True) -def length(a): +def _length(a): a = int(np.ceil(np.log2(a))) a = 2 << a return a @nb.njit(cache=True) -def hash_function(v): +def _hash_function(v): byte_mask = np.uint64(255) bs = np.uint64(v) @@ -777,8 +779,8 @@ def hash_function(v): @nb.njit(cache=True) -def make_hash_table(ar): - a = length(len(ar)) +def _make_hash_table(ar): + a = _length(len(ar)) mask = a - 1 uniques = np.empty(a, dtype=ar.dtype) @@ -787,7 +789,7 @@ def make_hash_table(ar): @nb.njit(cache=True) -def set_item(uniques, uniques_cnt, mask, h, v, total, miss_hits, weight): +def _set_item(uniques, uniques_cnt, mask, h, v, total, miss_hits, weight): index = h & mask while True: if uniques_cnt[index] == 0: @@ -807,7 +809,7 @@ def set_item(uniques, uniques_cnt, mask, h, v, total, miss_hits, weight): @nb.njit(cache=True) -def concrete(ar, uniques, uniques_cnt, a, total): +def _concrete(ar, uniques, uniques_cnt, a, total): # flush the results in a concrete array uniques_ = np.empty(total, dtype=ar.dtype) uniques_cnt_ = np.empty(total, dtype=np.int_) @@ -821,21 +823,21 @@ def concrete(ar, uniques, uniques_cnt, a, total): @nb.njit(cache=True) -def unique(ar): +def _unique(ar): # https://github.com/lhprojects/blog/blob/master/_posts/JupyterNotebooks/HashUnique.ipynb - uniques, uniques_cnt, l, mask = make_hash_table(ar) + uniques, uniques_cnt, l, mask = _make_hash_table(ar) total = 0 miss_hits = 0 for v in ar: - h = hash_function(v) - total, miss_hits = set_item( + h = _hash_function(v) + total, miss_hits = _set_item( uniques, uniques_cnt, mask, h, v, total, miss_hits, 1 ) - uniques_, uniques_cnt_ = concrete(ar, uniques, uniques_cnt, l, total) + uniques_, uniques_cnt_ = _concrete(ar, uniques, uniques_cnt, l, total) return uniques_, uniques_cnt_ -def get_borf_params( +def _get_borf_params( time_series_min_length, time_series_max_length, window_size_min_window_size=4, @@ -853,7 +855,7 @@ def get_borf_params( dilations_max_dilation=None, ): params = {} - params["window_sizes"] = get_window_sizes( + params["window_sizes"] = _get_window_sizes( m_min=time_series_min_length, 
m_max=time_series_max_length, min_window_size=window_size_min_window_size, @@ -861,23 +863,23 @@ def get_borf_params( power=window_size_power, ).tolist() - params["word_lengths"] = get_word_lengths( + params["word_lengths"] = _get_word_lengths( n_word_lengths=word_lengths_n_word_lengths, start=0, ).tolist() - params["dilations"] = get_dilations( + params["dilations"] = _get_dilations( max_length=time_series_max_length, min_dilation=dilations_min_dilation, max_dilation=dilations_max_dilation, ).tolist() - params["strides"] = get_strides(n_strides=strides_n_strides).tolist() - params["alphabet_sizes_slope"] = get_alphabet_sizes( + params["strides"] = _get_strides(n_strides=strides_n_strides).tolist() + params["alphabet_sizes_slope"] = _get_alphabet_sizes( min_symbols=alphabets_slope_min_symbols, max_symbols=alphabets_slope_max_symbols, step=alphabets_slope_step, ).tolist() - params["alphabet_sizes_mean"] = get_alphabet_sizes( + params["alphabet_sizes_mean"] = _get_alphabet_sizes( min_symbols=alphabets_mean_min_symbols, max_symbols=alphabets_mean_max_symbols, step=alphabets_mean_step, @@ -886,11 +888,11 @@ def get_borf_params( @nb.njit(cache=True) -def is_empty(a) -> bool: +def _is_empty(a) -> bool: return a.size == 0 -def get_window_sizes(m_min, m_max, min_window_size=4, max_window_size=None, power=2): +def _get_window_sizes(m_min, m_max, min_window_size=4, max_window_size=None, power=2): if max_window_size is None: max_window_size = m_max m = 2 @@ -904,17 +906,17 @@ def get_window_sizes(m_min, m_max, min_window_size=4, max_window_size=None, powe m = int(m * power) windows = np.array(windows) windows_min = np.array(windows_min[1:]) - if not is_empty(windows_min): + if not _is_empty(windows_min): if m_min <= windows_min.max() * power: windows = np.concatenate([windows_min, windows]) return windows.astype(int) -def get_word_lengths(n_word_lengths=4, start=0): +def _get_word_lengths(n_word_lengths=4, start=0): return np.array([2**i for i in range(start, n_word_lengths + start)]) -def get_dilations(max_length, min_dilation=1, max_dilation=None): +def _get_dilations(max_length, min_dilation=1, max_dilation=None): dilations = list() max_length_log2 = np.log2(max_length) if max_dilation is None: @@ -926,15 +928,15 @@ def get_dilations(max_length, min_dilation=1, max_dilation=None): return np.array(dilations) -def get_strides(n_strides=1): +def _get_strides(n_strides=1): return np.arange(1, n_strides + 1) -def get_alphabet_sizes(min_symbols=3, max_symbols=4, step=1): +def _get_alphabet_sizes(min_symbols=3, max_symbols=4, step=1): return np.arange(min_symbols, max_symbols, step) -def generate_sax_parameters_configurations( +def _generate_sax_parameters_configurations( window_sizes, strides, dilations, @@ -963,7 +965,7 @@ def generate_sax_parameters_configurations( word_length, alphabet_mean, alphabet_slope, - ) = extract_parameters_from_args(parameter) + ) = _extract_parameters_from_args(parameter) if word_length > window_size: # word_length cannot be greater than window_size continue if ( @@ -984,7 +986,7 @@ def generate_sax_parameters_configurations( @nb.njit(cache=True) -def is_valid_windowing(sequence_size: int, window_size: int, dilation: int) -> bool: +def _is_valid_windowing(sequence_size: int, window_size: int, dilation: int) -> bool: if ( sequence_size < window_size * dilation ): # if window_size * dilation exceeds the length of the sequence @@ -993,7 +995,7 @@ def is_valid_windowing(sequence_size: int, window_size: int, dilation: int) -> b return True -def 
clean_sax_parameters_configurations(parameters, max_length): +def _clean_sax_parameters_configurations(parameters, max_length): cleaned_parameters = list() for parameter in parameters: window_size = parameter["window_size"] @@ -1002,7 +1004,7 @@ def clean_sax_parameters_configurations(parameters, max_length): alphabet_mean = parameter["alphabet_size_mean"] alphabet_slope = parameter["alphabet_size_slope"] stride = parameter["stride"] - if not is_valid_windowing( + if not _is_valid_windowing( window_size=window_size, sequence_size=max_length, dilation=dilation ): continue @@ -1021,7 +1023,7 @@ def clean_sax_parameters_configurations(parameters, max_length): return cleaned_parameters -def sax_parameters_configurations_linear_strides(parameters): +def _sax_parameters_configurations_linear_strides(parameters): new_parameters = list() for parameter in parameters: window_size = parameter["window_size"] @@ -1041,7 +1043,7 @@ def sax_parameters_configurations_linear_strides(parameters): return new_parameters -def extract_parameters_from_args(parameter): +def _extract_parameters_from_args(parameter): window_size = parameter[0] stride = parameter[1] dilation = parameter[2] @@ -1051,7 +1053,7 @@ def extract_parameters_from_args(parameter): return window_size, stride, dilation, word_length, alphabet_mean, alphabet_slope -def heuristic_function_1dsax( +def _heuristic_function_1dsax( time_series_min_length, time_series_max_length, window_size_min_window_size=4, @@ -1069,7 +1071,7 @@ def heuristic_function_1dsax( dilations_max_dilation=None, complexity: Literal["quadratic", "linear"] = "quadratic", ): - params = get_borf_params( + params = _get_borf_params( time_series_min_length=time_series_min_length, time_series_max_length=time_series_max_length, window_size_min_window_size=window_size_min_window_size, @@ -1087,7 +1089,7 @@ def heuristic_function_1dsax( dilations_max_dilation=dilations_max_dilation, ) - params_list = generate_sax_parameters_configurations( + params_list = _generate_sax_parameters_configurations( window_sizes=params["window_sizes"], strides=params["strides"], dilations=params["dilations"], @@ -1096,19 +1098,19 @@ def heuristic_function_1dsax( alphabet_sizes_slope=params["alphabet_sizes_slope"], ) - cleaned_params_list = clean_sax_parameters_configurations( + cleaned_params_list = _clean_sax_parameters_configurations( parameters=params_list, max_length=time_series_max_length ) if complexity == "linear": - cleaned_params_list = sax_parameters_configurations_linear_strides( + cleaned_params_list = _sax_parameters_configurations_linear_strides( parameters=cleaned_params_list ) return cleaned_params_list -def heuristic_function_sax( +def _heuristic_function_sax( time_series_min_length, time_series_max_length, window_size_min_window_size=4, @@ -1123,7 +1125,7 @@ def heuristic_function_sax( dilations_max_dilation=None, complexity: Literal["quadratic", "linear"] = "quadratic", ): - configs = heuristic_function_1dsax( + configs = _heuristic_function_1dsax( time_series_min_length=time_series_min_length, time_series_max_length=time_series_max_length, window_size_min_window_size=window_size_min_window_size, From 9331de2fa7b9da2798011a35afb361f89448054a Mon Sep 17 00:00:00 2001 From: francesco Date: Tue, 15 Oct 2024 11:16:50 +0200 Subject: [PATCH 11/19] Add the option for densifying the BORF output --- .../collection/dictionary_based/_borf.py | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py 
b/aeon/transformations/collection/dictionary_based/_borf.py index 75832729dc..42038e3842 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -65,6 +65,9 @@ class BORF(BaseCollectionTransformer): The computational complexity mode: - `'quadratic'`: Higher accuracy with more computational cost. - `'linear'`: Faster computations with potentially lower accuracy. + densify : bool, default=False + If True, converts the output to a dense array. + By default, the output is returned as a scipy sparse matrix. Attributes @@ -122,6 +125,7 @@ def __init__( n_jobs_numba=1, transformer_weights=None, complexity: Literal["quadratic", "linear"] = "quadratic", + densify=False, ): self.window_size_min_window_size = window_size_min_window_size self.window_size_max_window_size = window_size_max_window_size @@ -136,12 +140,22 @@ def __init__( self.n_jobs_numba = n_jobs_numba self.transformer_weights = transformer_weights self.complexity = complexity + self.densify = densify super().__init__() def _fit(self, X, y=None): time_series_length = X.shape[2] # for better computation time, this should be moved to the init, # setting time_series_length as a user parameter + + pipeline_objects = [ + (ReshapeTo2D, dict()), + (ZeroColumnsRemover, dict()), + (ToScipySparse, dict()), + ] + if self.densify: + pipeline_objects.append((ToDense, dict())) + self.pipe_, self.configs_ = _build_pipeline_auto( time_series_min_length=time_series_length, time_series_max_length=time_series_length, @@ -157,11 +171,7 @@ def _fit(self, X, y=None): n_jobs=self.n_jobs, n_jobs_numba=self.n_jobs_numba, transformer_weights=self.transformer_weights, - pipeline_objects=[ - (ReshapeTo2D, dict()), - (ZeroColumnsRemover, dict()), - (ToScipySparse, dict()), - ], + pipeline_objects=pipeline_objects, complexity=self.complexity, ) self.pipe_.fit(X) @@ -374,6 +384,17 @@ def transform(self, X): return X.to_scipy_sparse() +class ToDense(BaseEstimator, TransformerMixin): + def __init__(self): + pass + + def fit(self, X, y=None): + return self + + def transform(self, X): + return X.toarray() + + @nb.njit(fastmath=True, cache=True) def _convert_to_base_10(number, base): result = 0 From 59f0ff87c664de4495860592b3cab2f83bb189d7 Mon Sep 17 00:00:00 2001 From: francesco Date: Tue, 15 Oct 2024 11:20:34 +0200 Subject: [PATCH 12/19] Make most public classes private --- .../collection/dictionary_based/_borf.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py index 42038e3842..1549f5eea9 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -149,12 +149,12 @@ def _fit(self, X, y=None): # setting time_series_length as a user parameter pipeline_objects = [ - (ReshapeTo2D, dict()), - (ZeroColumnsRemover, dict()), - (ToScipySparse, dict()), + (_ReshapeTo2D, dict()), + (_ZeroColumnsRemover, dict()), + (_ToScipySparse, dict()), ] if self.densify: - pipeline_objects.append((ToDense, dict())) + pipeline_objects.append((_ToDense, dict())) self.pipe_, self.configs_ = _build_pipeline_auto( time_series_min_length=time_series_length, @@ -338,7 +338,7 @@ def transform(self, X, y=None): return sparse.COO(coords=out[:, :3].T, data=out[:, -1].T, shape=shape_) -class ZeroColumnsRemover(BaseEstimator, TransformerMixin): +class _ZeroColumnsRemover(BaseEstimator, 
TransformerMixin): def __init__(self, axis=0): self.axis = axis @@ -354,7 +354,7 @@ def transform(self, X): return X[..., self.columns_to_keep_] -class ReshapeTo2D(BaseEstimator, TransformerMixin): +class _ReshapeTo2D(BaseEstimator, TransformerMixin): def __init__(self, keep_unraveled_index=False): self.keep_unraveled_index = keep_unraveled_index # shape: (n_flattened_features, 2) -> flattened index -> (dimension, word) @@ -373,7 +373,7 @@ def transform(self, X): return X.reshape((X.shape[0], -1)) -class ToScipySparse(BaseEstimator, TransformerMixin): +class _ToScipySparse(BaseEstimator, TransformerMixin): def __init__(self): pass @@ -384,7 +384,7 @@ def transform(self, X): return X.to_scipy_sparse() -class ToDense(BaseEstimator, TransformerMixin): +class _ToDense(BaseEstimator, TransformerMixin): def __init__(self): pass From a940180d14ad2fe2088537243407da3159f81b80 Mon Sep 17 00:00:00 2001 From: francesco Date: Tue, 15 Oct 2024 11:25:21 +0200 Subject: [PATCH 13/19] Update get_test_params with densify option --- .../collection/dictionary_based/_borf.py | 49 +++++++++++++------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py index 1549f5eea9..477862a2df 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -199,21 +199,40 @@ def get_test_params(cls, parameter_set="default"): instance. `create_test_instance` uses the first (or only) dictionary in `params`. """ - params = { - "window_size_min_window_size": 4, - "window_size_max_window_size": None, - "word_lengths_n_word_lengths": 4, - "alphabets_min_symbols": 3, - "alphabets_max_symbols": 4, - "alphabets_step": 1, - "dilations_min_dilation": 1, - "dilations_max_dilation": None, - "min_window_to_signal_std_ratio": 0.0, - "n_jobs": 1, - "n_jobs_numba": 1, - "transformer_weights": None, - "complexity": "quadratic", - } + params = [ + { + "window_size_min_window_size": 4, + "window_size_max_window_size": None, + "word_lengths_n_word_lengths": 4, + "alphabets_min_symbols": 3, + "alphabets_max_symbols": 4, + "alphabets_step": 1, + "dilations_min_dilation": 1, + "dilations_max_dilation": None, + "min_window_to_signal_std_ratio": 0.0, + "n_jobs": 1, + "n_jobs_numba": 1, + "transformer_weights": None, + "complexity": "quadratic", + "densify": False, + }, + { + "window_size_min_window_size": 4, + "window_size_max_window_size": None, + "word_lengths_n_word_lengths": 4, + "alphabets_min_symbols": 3, + "alphabets_max_symbols": 4, + "alphabets_step": 1, + "dilations_min_dilation": 1, + "dilations_max_dilation": None, + "min_window_to_signal_std_ratio": 0.0, + "n_jobs": 1, + "n_jobs_numba": 1, + "transformer_weights": None, + "complexity": "quadratic", + "densify": True, + }, + ] return params From 49cef20c142a9790f6de3c29544af1c2f697095f Mon Sep 17 00:00:00 2001 From: francesco Date: Thu, 17 Oct 2024 09:53:22 +0200 Subject: [PATCH 14/19] Remove outdated comments --- aeon/transformations/collection/dictionary_based/_borf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py index 477862a2df..c25a4b4f80 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -1048,8 +1048,6 @@ def _clean_sax_parameters_configurations(parameters, max_length): 
window_size=window_size, sequence_size=max_length, dilation=dilation ): continue - # if window_size == word_length: - # continue # FIXME: should I do this? cleaned_parameters.append( dict( window_size=window_size, From aa4cb8fee6f48f6b002012985572e1a0cdfecc77 Mon Sep 17 00:00:00 2001 From: francesco Date: Fri, 18 Oct 2024 16:04:06 +0200 Subject: [PATCH 15/19] Resolve minor conflicts --- .all-contributorsrc | 9 +++++++++ docs/api_reference/transformations.rst | 1 + pyproject.toml | 1 + 3 files changed, 11 insertions(+) diff --git a/.all-contributorsrc b/.all-contributorsrc index 7118f6ebbc..4aabbcb4af 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -2583,6 +2583,15 @@ "design", "doc" ] + }, + { + "login": "fspinna", + "name": "Francesco Spinnato", + "avatar_url": "https://avatars.githubusercontent.com/u/35352023?v=4", + "profile": "https://github.com/fspinna", + "contributions": [ + "code" + ] } ], "commitType": "docs" diff --git a/docs/api_reference/transformations.rst b/docs/api_reference/transformations.rst index d471923f5c..06f641a3a9 100644 --- a/docs/api_reference/transformations.rst +++ b/docs/api_reference/transformations.rst @@ -101,6 +101,7 @@ Dictionary-based features PAA SFA SFAFast + BORF Feature based diff --git a/pyproject.toml b/pyproject.toml index aeb02e3cde..9dfa087c32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ all_extras = [ "torch>=1.13.1", "tsfresh>=0.20.0", "tslearn>=0.5.2", + "sparse" ] dl = [ "keras-self-attention", From 97dcd63518a7a4461de1a9812c7252dda8b152f1 Mon Sep 17 00:00:00 2001 From: francesco Date: Fri, 18 Oct 2024 17:25:50 +0200 Subject: [PATCH 16/19] fix doctest --- aeon/transformations/collection/dictionary_based/_borf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py index c25a4b4f80..d3a35d4aff 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -88,7 +88,7 @@ class BORF(BaseCollectionTransformer): -------- >>> from aeon.transformations.collection.dictionary_based import BORF >>> from aeon.datasets import load_unit_test - >>> X, _ = load_unit_test(return_X_y=True) + >>> X, _ = load_unit_test() >>> borf = BORF() # doctest: +SKIP >>> borf.fit(X) # doctest: +SKIP BORF() From a8689e42b89fe824bc5aa76e1663218ba1d0f142 Mon Sep 17 00:00:00 2001 From: francesco Date: Fri, 18 Oct 2024 18:04:49 +0200 Subject: [PATCH 17/19] fix tags --- aeon/transformations/collection/dictionary_based/_borf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py index d3a35d4aff..989ea607fa 100644 --- a/aeon/transformations/collection/dictionary_based/_borf.py +++ b/aeon/transformations/collection/dictionary_based/_borf.py @@ -99,13 +99,12 @@ class BORF(BaseCollectionTransformer): _tags = { "X_inner_type": "numpy3D", "capability:inverse_transform": False, - "capability:missing_values:removes": True, + "capability:missing_values": True, "capability:multivariate": True, "capability:multithreading": True, "input_data_type": "Collection", "algorithm_type": "dictionary", "output_data_type": "Tabular", - "instancewise": False, "requires_y": False, "python_dependencies": "sparse", } From a726bc66d6b56716f63873074083d6a420ae8f9f Mon Sep 17 00:00:00 2001 From: francesco Date: Mon, 21 Oct 2024 11:03:50 
+0200
Subject: [PATCH 18/19] add cant_pickle and non_deterministic tags for
 compatibility issues with csr matrices, add feature_names_in_ and
 n_features_in_ attributes for sklearn compatibility

---
 .../collection/dictionary_based/_borf.py      | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py
index 989ea607fa..836d5ce5a8 100644
--- a/aeon/transformations/collection/dictionary_based/_borf.py
+++ b/aeon/transformations/collection/dictionary_based/_borf.py
@@ -107,6 +107,8 @@ class BORF(BaseCollectionTransformer):
         "output_data_type": "Tabular",
         "requires_y": False,
         "python_dependencies": "sparse",
+        "cant_pickle": True,
+        "non_deterministic": True,
     }

     def __init__(
@@ -363,6 +365,9 @@ def __init__(self, axis=0):
         self.n_original_columns_ = None
         self.columns_to_keep_ = None

+        self.feature_names_in_ = None
+        self.n_features_in_ = None
+
     def fit(self, X, y=None):
         self.n_original_columns_ = X.shape[1]
         self.columns_to_keep_ = np.argwhere(X.any(axis=self.axis)).ravel()
@@ -379,6 +384,9 @@ def __init__(self, keep_unraveled_index=False):
         self.unraveled_index_ = None
         self.original_shape_ = None

+        self.feature_names_in_ = None
+        self.n_features_in_ = None
+
     def fit(self, X, y=None):
         self.original_shape_ = X.shape
         if self.keep_unraveled_index:
@@ -393,7 +401,8 @@ def transform(self, X):

 class _ToScipySparse(BaseEstimator, TransformerMixin):
     def __init__(self):
-        pass
+        self.feature_names_in_ = None
+        self.n_features_in_ = None

     def fit(self, X, y=None):
         return self
@@ -404,7 +413,8 @@ def transform(self, X):

 class _ToDense(BaseEstimator, TransformerMixin):
     def __init__(self):
-        pass
+        self.feature_names_in_ = None
+        self.n_features_in_ = None

     def fit(self, X, y=None):
         return self

From 7c6e6dbbae4ce844010c43c39aa67a6121ec1403 Mon Sep 17 00:00:00 2001
From: francesco
Date: Mon, 21 Oct 2024 11:12:47 +0200
Subject: [PATCH 19/19] add feature_names_in_ and n_features_in_ attributes to
 IndividualBORF for sklearn compatibility

---
 aeon/transformations/collection/dictionary_based/_borf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/aeon/transformations/collection/dictionary_based/_borf.py b/aeon/transformations/collection/dictionary_based/_borf.py
index 836d5ce5a8..f1af5bf944 100644
--- a/aeon/transformations/collection/dictionary_based/_borf.py
+++ b/aeon/transformations/collection/dictionary_based/_borf.py
@@ -338,6 +338,9 @@ def __init__(
         )
         _set_n_jobs_numba(n_jobs=self.n_jobs)

+        self.feature_names_in_ = None
+        self.n_features_in_ = None
+
     def fit(self, X, y=None):
         return self
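
A note on the counting trick behind the helpers renamed earlier in this
series: each extracted SAX word is an array of integer symbols, and
_array_to_int_new_base packs it into a single int64 by reading the array as
a number in base alphabet_size; the packed integers are then counted with
the numba open-addressing hash table (_hash_function, _make_hash_table,
_unique). The snippet below is an illustrative plain-Python equivalent of
the packing step only; it is not code from the patch, and word_to_int is a
hypothetical name used for illustration.

# Plain-Python sketch of the base conversion performed by
# _array_to_int_new_base (the patched code runs a numba-compiled
# version of the same loop).
def word_to_int(word, base):
    # Treat the symbol array as digits of a base-`base` number,
    # most significant symbol first (positional notation).
    result = 0
    for symbol in word:
        result = result * base + symbol
    return result

# A word of length 4 over an alphabet of size 3:
assert word_to_int([2, 0, 1, 1], base=3) == 2 * 27 + 0 * 9 + 1 * 3 + 1  # 58

Packing words into integers this way lets the transform count occurrences
with an integer-keyed hash table rather than hashing symbol tuples, which is
what keeps the compiled counting path fast.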
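
To see the result of the series end to end, here is a minimal usage sketch.
It is not part of the patches; it assumes aeon with this series applied and
the optional "sparse" dependency from pyproject.toml installed. By default
transform returns a scipy sparse matrix; with densify=True (PATCH 11) the
same features come back as a dense numpy array.

# Minimal usage sketch (assumes this patch series is applied and the
# optional "sparse" dependency is installed).
from aeon.datasets import load_unit_test
from aeon.transformations.collection.dictionary_based import BORF

X, _ = load_unit_test()  # collection of shape (n_cases, n_channels, n_timepoints)

# Default output: a scipy sparse matrix with one row per case.
Xt_sparse = BORF(n_jobs=1).fit_transform(X)

# densify=True converts the same output to a dense numpy array.
Xt_dense = BORF(densify=True).fit_transform(X)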