From ed8a9b796b2577c560208e998b3870f6690ef171 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 08:53:40 +0300 Subject: [PATCH 01/61] GordoTimeseriesGenerator first version --- gordo/machine/model/models.py | 91 ++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 7e82082b9..b9252dc21 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -4,7 +4,7 @@ import logging import io from pprint import pprint -from typing import Union, Callable, Dict, Any, Optional +from typing import Union, Callable, Dict, Any, Optional, Tuple from abc import ABCMeta import h5py @@ -439,10 +439,8 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": """ - X = X.values if isinstance(X, pd.DataFrame) else X - y = y.values if isinstance(y, pd.DataFrame) else y - - X = self._validate_and_fix_size_of_X(X) + if not isinstance(X, pd.DataFrame): + X = self._validate_and_fix_size_of_X(X) # We call super.fit on a single sample (notice the batch_size=1) to initiate the # model using the scikit-learn wrapper. @@ -574,13 +572,34 @@ def lookahead(self) -> int: return 0 +def pad_x_and_y( + X: np.ndarray, y: np.ndarray, lookahead: int +) -> Tuple[np.ndarray, np.ndarray]: + new_length = len(X) + 1 - lookahead + if lookahead == 1: + return X, y + elif lookahead >= 0: + pad_kw = dict(maxlen=new_length, dtype=X.dtype) + + if lookahead == 0: + X = pad_sequences([X], padding="post", **pad_kw)[0] + y = pad_sequences([y], padding="pre", **pad_kw)[0] + + elif lookahead > 1: + X = pad_sequences([X], padding="post", truncating="post", **pad_kw)[0] + y = pad_sequences([y], padding="pre", truncating="pre", **pad_kw)[0] + return X, y + else: + raise ValueError(f"Value of `lookahead` can not be negative, is {lookahead}") + + def create_keras_timeseriesgenerator( - X: np.ndarray, - y: Optional[np.ndarray], + X: Union[pd.DataFrame, np.ndarray], + y: Optional[Union[pd.DataFrame, np.ndarray]], batch_size: int, lookback_window: int, lookahead: int, -) -> tensorflow.keras.preprocessing.sequence.TimeseriesGenerator: +) -> object: """ Provides a `keras.preprocessing.sequence.TimeseriesGenerator` for use with LSTM's, but with the added ability to specify the lookahead of the target in y. @@ -632,27 +651,45 @@ def create_keras_timeseriesgenerator( >>> len(gen[0][0][0][0]) # n_features = 2 2 """ - new_length = len(X) + 1 - lookahead - kwargs: Dict[str, Any] = dict(length=lookback_window, batch_size=batch_size) - if lookahead == 1: - kwargs.update(dict(data=X, targets=y)) - elif lookahead >= 0: + if isinstance(X, pd.DataFrame): + if not isinstance(y, pd.DataFrame): + raise ValueError("'y' should be instance of pandas.DataFrame") + return GordoTimeseriesGenerator( + data=X, targets=y, length=lookback_window, batch_size=batch_size + ) + else: + X, y = pad_x_and_y(X, y, lookahead) + return TimeseriesGenerator( + data=X, targets=y, length=lookback_window, batch_size=batch_size + ) - pad_kw = dict(maxlen=new_length, dtype=X.dtype) - if lookahead == 0: - kwargs["data"] = pad_sequences([X], padding="post", **pad_kw)[0] - kwargs["targets"] = pad_sequences([y], padding="pre", **pad_kw)[0] +class GordoTimeseriesGenerator(object): + def __init__( + self, data: pd.DataFrame, targets: pd.DataFrame, length: int, batch_size=128 + ): - elif lookahead > 1: - kwargs["data"] = pad_sequences( - [X], padding="post", truncating="post", **pad_kw - )[0] - kwargs["targets"] = pad_sequences( - [y], padding="pre", truncating="pre", **pad_kw - )[0] - else: - raise ValueError(f"Value of `lookahead` can not be negative, is {lookahead}") + if len(data) != len(targets): + raise ValueError( + "Data and targets have to be" + " of same length. " + "Data length is {}".format(len(data)) + + " while target length is {}".format(len(targets)) + ) + + self.data = data + self.targets = targets + self.length = length + self.batch_size = batch_size + + def __len__(self): + return (len(self.data) - 1 + self.batch_size) // self.batch_size + + def __getitem__(self, index): + i = self.batch_size * index + rows = np.arange(i, min(i + self.batch_size, len(self.data)), 1) + + samples = np.array([self.data[row - self.length : row : 1] for row in rows]) + targets = np.array([self.targets[row] for row in rows]) - return TimeseriesGenerator(**kwargs) + return samples, targets From af53b763dda97a239c23684fbbab45b62aa10b60 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 11:01:50 +0300 Subject: [PATCH 02/61] First approach for GordoTimeseriesGenerator --- gordo/machine/model/models.py | 60 +++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index b9252dc21..b46d7c816 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -6,6 +6,7 @@ from pprint import pprint from typing import Union, Callable, Dict, Any, Optional, Tuple from abc import ABCMeta +from math import ceil import h5py import tensorflow.keras.models @@ -654,7 +655,7 @@ def create_keras_timeseriesgenerator( if isinstance(X, pd.DataFrame): if not isinstance(y, pd.DataFrame): - raise ValueError("'y' should be instance of pandas.DataFrame") + raise ValueError("'y' should be an instance of pandas.DataFrame") return GordoTimeseriesGenerator( data=X, targets=y, length=lookback_window, batch_size=batch_size ) @@ -667,7 +668,12 @@ def create_keras_timeseriesgenerator( class GordoTimeseriesGenerator(object): def __init__( - self, data: pd.DataFrame, targets: pd.DataFrame, length: int, batch_size=128 + self, + data: pd.DataFrame, + targets: pd.DataFrame, + length: int, + batch_size=128, + step: Optional[pd.Timedelta] = None, ): if len(data) != len(targets): @@ -681,15 +687,55 @@ def __init__( self.targets = targets self.length = length self.batch_size = batch_size + if step is None: + step = pd.Timedelta(minutes=10) + self.step = step + self.time_batch_size = step * batch_size def __len__(self): return (len(self.data) - 1 + self.batch_size) // self.batch_size + def split_consecutive( + self, df: pd.DataFrame + ) -> Tuple[pd.DataFrame, Optional[pd.Timestamp]]: + prev_date = None + start_date = None + for dt in df.index: + if prev_date is None: + prev_date = dt + start_date = dt + else: + if dt - prev_date != self.step: + return df.loc[start_date:prev_date], dt + prev_date = dt + return df, None + def __getitem__(self, index): i = self.batch_size * index - rows = np.arange(i, min(i + self.batch_size, len(self.data)), 1) - - samples = np.array([self.data[row - self.length : row : 1] for row in rows]) - targets = np.array([self.targets[row] for row in rows]) - return samples, targets + index = self.data.index + + samples = [] + current_date = index.min() + while True: + batch = self.data[current_date : current_date + self.time_batch_size] + if batch.empty: + break + if len(batch) == self.batch_size: + samples.append(batch.values) + current_date += self.step + else: + batch, last_date = self.split_consecutive(batch) + batch_values = batch.values + if last_date is not None: + current_date = last_date + batch_values = pad_sequences( + [batch_values], padding="post", truncating="post", maxlen=batch + )[0] + else: + current_date += self.step + samples.append(batch_values) + + targets = np.array([self.targets[row] for row in range(len(samples))]) + + return np.array(samples), np.array(targets) From 1daba7ef82ca3796c99b6a1c0340e6b00726fd14 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 11:14:59 +0300 Subject: [PATCH 03/61] Always pass pd.DataFrame for create_keras_timeseriesgenerator() --- gordo/machine/model/models.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index b46d7c816..6a9481a60 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -442,6 +442,8 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": if not isinstance(X, pd.DataFrame): X = self._validate_and_fix_size_of_X(X) + else: + pass #TODO # We call super.fit on a single sample (notice the batch_size=1) to initiate the # model using the scikit-learn wrapper. @@ -513,9 +515,11 @@ def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: >>> model_transform.shape (2, 2) """ - X = X.values if isinstance(X, pd.DataFrame) else X + if not isinstance(X, pd.DataFrame): + X = self._validate_and_fix_size_of_X(X) + else: + pass #TODO - X = self._validate_and_fix_size_of_X(X) tsg = create_keras_timeseriesgenerator( X=X, y=X, @@ -656,6 +660,7 @@ def create_keras_timeseriesgenerator( if isinstance(X, pd.DataFrame): if not isinstance(y, pd.DataFrame): raise ValueError("'y' should be an instance of pandas.DataFrame") + #TODO padding for X and y return GordoTimeseriesGenerator( data=X, targets=y, length=lookback_window, batch_size=batch_size ) From 5de4d4f954f4ed27c85292e1b42d202fc0a3903a Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 11:35:42 +0300 Subject: [PATCH 04/61] Fix targets in GordoTimeseriesGenerator --- gordo/machine/model/models.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 6a9481a60..522453925 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -443,7 +443,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": if not isinstance(X, pd.DataFrame): X = self._validate_and_fix_size_of_X(X) else: - pass #TODO + pass # TODO # We call super.fit on a single sample (notice the batch_size=1) to initiate the # model using the scikit-learn wrapper. @@ -518,7 +518,7 @@ def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: if not isinstance(X, pd.DataFrame): X = self._validate_and_fix_size_of_X(X) else: - pass #TODO + pass # TODO tsg = create_keras_timeseriesgenerator( X=X, @@ -660,7 +660,7 @@ def create_keras_timeseriesgenerator( if isinstance(X, pd.DataFrame): if not isinstance(y, pd.DataFrame): raise ValueError("'y' should be an instance of pandas.DataFrame") - #TODO padding for X and y + # TODO padding for X and y return GordoTimeseriesGenerator( data=X, targets=y, length=lookback_window, batch_size=batch_size ) @@ -677,7 +677,7 @@ def __init__( data: pd.DataFrame, targets: pd.DataFrame, length: int, - batch_size=128, + batch_size: int = 128, step: Optional[pd.Timedelta] = None, ): @@ -716,16 +716,17 @@ def split_consecutive( return df, None def __getitem__(self, index): - i = self.batch_size * index - - index = self.data.index + data = self.data + index = data.index samples = [] + rows = [] current_date = index.min() while True: - batch = self.data[current_date : current_date + self.time_batch_size] + batch = data.loc[current_date : current_date + self.time_batch_size] if batch.empty: break + rows.append(index.get_loc(current_date)) if len(batch) == self.batch_size: samples.append(batch.values) current_date += self.step @@ -741,6 +742,6 @@ def __getitem__(self, index): current_date += self.step samples.append(batch_values) - targets = np.array([self.targets[row] for row in range(len(samples))]) + targets = np.array([self.targets[row] for row in rows]) return np.array(samples), np.array(targets) From 309b28670bc9dc1cabfe40d0d89ec435073264df Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 12:06:53 +0300 Subject: [PATCH 05/61] Gix github checks --- gordo/machine/model/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 522453925..344a254fb 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -6,7 +6,6 @@ from pprint import pprint from typing import Union, Callable, Dict, Any, Optional, Tuple from abc import ABCMeta -from math import ceil import h5py import tensorflow.keras.models @@ -604,7 +603,7 @@ def create_keras_timeseriesgenerator( batch_size: int, lookback_window: int, lookahead: int, -) -> object: +) -> TimeseriesGenerator: """ Provides a `keras.preprocessing.sequence.TimeseriesGenerator` for use with LSTM's, but with the added ability to specify the lookahead of the target in y. From 3a0ceada0c39b53ff9b71b1f74c20f5421ff6b2e Mon Sep 17 00:00:00 2001 From: Serhii Date: Tue, 7 Apr 2020 14:25:10 +0300 Subject: [PATCH 06/61] Do GordoTimeseriesGenerator in different way --- gordo/machine/model/models.py | 139 +++++++++++++++++++++------------- 1 file changed, 87 insertions(+), 52 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 344a254fb..e27eb21f9 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -4,14 +4,16 @@ import logging import io from pprint import pprint -from typing import Union, Callable, Dict, Any, Optional, Tuple +from typing import Union, Callable, Dict, Any, Optional, Tuple, List from abc import ABCMeta +from dataclasses import dataclass import h5py import tensorflow.keras.models from tensorflow.keras.models import load_model, save_model from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator from tensorflow.keras.wrappers.scikit_learn import KerasRegressor as BaseWrapper +from tensorflow.python.keras.utils import data_utils import numpy as np import pandas as pd @@ -670,77 +672,110 @@ def create_keras_timeseriesgenerator( ) -class GordoTimeseriesGenerator(object): +@dataclass +class TimeseriesChunk: + start_ts: pd.Timestamp + end_ts: pd.Timestamp + size: int + + +@dataclass +class TimeseriesGeneratorContainer: + generator: TimeseriesGenerator + chunk: TimeseriesChunk + length: int + + +class GordoTimeseriesGenerator(data_utils.Sequence): def __init__( self, data: pd.DataFrame, targets: pd.DataFrame, length: int, batch_size: int = 128, - step: Optional[pd.Timedelta] = None, + shuffle: bool = False, + step: Optional[Union[pd.Timedelta, int]] = None, ): if len(data) != len(targets): raise ValueError( - "Data and targets have to be" + " of same length. " - "Data length is {}".format(len(data)) - + " while target length is {}".format(len(targets)) + "Data and targets have to be of same length. " + f"Data length is {len(data)}" + f" while target length is {len(targets)}" ) - self.data = data - self.targets = targets - self.length = length - self.batch_size = batch_size if step is None: step = pd.Timedelta(minutes=10) + if isinstance(step, int): + step = pd.Timedelta(minutes=step) self.step = step - self.time_batch_size = step * batch_size + self.consecutive_chunks = self.find_consecutive_chunks(data) + self.failed_chunks = [] + self.generators_containers = self.create_generator_containers( + data, targets, length=length, batch_size=batch_size, shuffle=shuffle + ) + + def filter_chunks(self, indexes=None): + if indexes is not None: + self.generators_containers = [ + self.generators_containers[i] for i in indexes + ] def __len__(self): - return (len(self.data) - 1 + self.batch_size) // self.batch_size + return sum(container.length for container in self.generators_containers) - def split_consecutive( - self, df: pd.DataFrame - ) -> Tuple[pd.DataFrame, Optional[pd.Timestamp]]: - prev_date = None - start_date = None + def find_consecutive_chunks(self, df: pd.DataFrame) -> List[TimeseriesChunk]: + chunks = [] + prev_ts, start_ts, size = None, None, 0 for dt in df.index: - if prev_date is None: - prev_date = dt - start_date = dt + if prev_ts is None: + prev_ts = dt + start_ts = dt else: - if dt - prev_date != self.step: - return df.loc[start_date:prev_date], dt - prev_date = dt - return df, None - - def __getitem__(self, index): - data = self.data - index = data.index - - samples = [] - rows = [] - current_date = index.min() - while True: - batch = data.loc[current_date : current_date + self.time_batch_size] - if batch.empty: - break - rows.append(index.get_loc(current_date)) - if len(batch) == self.batch_size: - samples.append(batch.values) - current_date += self.step - else: - batch, last_date = self.split_consecutive(batch) - batch_values = batch.values - if last_date is not None: - current_date = last_date - batch_values = pad_sequences( - [batch_values], padding="post", truncating="post", maxlen=batch - )[0] + if dt - prev_ts == self.step: + size += 1 + prev_ts = dt else: - current_date += self.step - samples.append(batch_values) + chunks.append(TimeseriesChunk(start_ts, prev_ts, size)) + prev_ts, start_ts, size = None, None, 0 + if start_ts is not None: + chunks.append(TimeseriesChunk(start_ts, prev_ts, size)) + return chunks - targets = np.array([self.targets[row] for row in rows]) + def create_generator_containers( + self, + data: pd.DataFrame, + targets: pd.DataFrame, + length: int, + batch_size: int, + shuffle: bool, + ) -> List[TimeseriesGeneratorContainer]: + generator_containers = [] + for chunk in self.consecutive_chunks: + gen_data = data[chunk.start_ts : chunk.end_ts].values + gen_target = targets[chunk.start_ts : chunk.end_ts].values + try: + generator = TimeseriesGenerator( + gen_data, + gen_target, + length=length, + batch_size=batch_size, + shuffle=shuffle, + ) + except ValueError: + self.failed_chunks.append(chunk) + length = len(generator) + generator_containers.append( + TimeseriesGeneratorContainer(generator, chunk, length) + ) + return generator_containers - return np.array(samples), np.array(targets) + def __getitem__(self, index): + i = -1 + for container in self.generators_containers: + new_i = i + container.length + if index <= new_i: + gen_i = index - i - 1 + return container.generator[gen_i] + i = new_i + raise IndexError(index) From 2cf6c3323f4939d4dc32126d72a5fbfd98848b89 Mon Sep 17 00:00:00 2001 From: Serhii Date: Wed, 8 Apr 2020 09:36:11 +0300 Subject: [PATCH 07/61] Fix annotation checks on Github --- gordo/machine/model/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index e27eb21f9..d1161fa8a 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -710,7 +710,7 @@ def __init__( step = pd.Timedelta(minutes=step) self.step = step self.consecutive_chunks = self.find_consecutive_chunks(data) - self.failed_chunks = [] + self.failed_chunks: List[TimeseriesChunk] = [] self.generators_containers = self.create_generator_containers( data, targets, length=length, batch_size=batch_size, shuffle=shuffle ) From 1d7ad98075a8af0df4d7fa747cad8db5f51602d9 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 09:19:33 +0300 Subject: [PATCH 08/61] logger.debug() in GordoTimeseriesGenerator --- gordo/machine/model/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index d1161fa8a..7967bd6c5 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -710,10 +710,12 @@ def __init__( step = pd.Timedelta(minutes=step) self.step = step self.consecutive_chunks = self.find_consecutive_chunks(data) + logger.debug('GordoTimeseriesGenerator with consecutive_chunks=%s', self.consecutive_chunks) self.failed_chunks: List[TimeseriesChunk] = [] self.generators_containers = self.create_generator_containers( data, targets, length=length, batch_size=batch_size, shuffle=shuffle ) + logger.debug('GordoTimeseriesGenerator with generators_containers=%s', self.generators_containers) def filter_chunks(self, indexes=None): if indexes is not None: From 0fcbe5942e897e1811a2df276967234d38d84c51 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 09:32:58 +0300 Subject: [PATCH 09/61] raise ValueError if the time series in wrong shape --- gordo/machine/model/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 7967bd6c5..c425fb71b 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -716,6 +716,8 @@ def __init__( data, targets, length=length, batch_size=batch_size, shuffle=shuffle ) logger.debug('GordoTimeseriesGenerator with generators_containers=%s', self.generators_containers) + if not self.generators_containers: + raise ValueError("Seems like the time series are too small or in random order") def filter_chunks(self, indexes=None): if indexes is not None: From aab9f4169e500265e101c9b86da0193281f3147c Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:01:07 +0300 Subject: [PATCH 10/61] test_find_consecutive_chunks() --- gordo/machine/model/models.py | 24 ++++----- .../model/test_gordo_timeseries_generator.py | 49 +++++++++++++++++++ 2 files changed, 62 insertions(+), 11 deletions(-) create mode 100644 tests/gordo/machine/model/test_gordo_timeseries_generator.py diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index c425fb71b..9763f4b0c 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -717,7 +717,8 @@ def __init__( ) logger.debug('GordoTimeseriesGenerator with generators_containers=%s', self.generators_containers) if not self.generators_containers: - raise ValueError("Seems like the time series are too small or in random order") + raise ValueError("Seems like the time series are too small or in random order." + "Failed chunks: %s" % self.consecutive_chunks) def filter_chunks(self, indexes=None): if indexes is not None: @@ -730,20 +731,20 @@ def __len__(self): def find_consecutive_chunks(self, df: pd.DataFrame) -> List[TimeseriesChunk]: chunks = [] - prev_ts, start_ts, size = None, None, 0 - for dt in df.index: + prev_ts, start_ts, start_i = None, None, 0 + for i, dt in enumerate(df.index): if prev_ts is None: prev_ts = dt start_ts = dt else: if dt - prev_ts == self.step: - size += 1 prev_ts = dt else: - chunks.append(TimeseriesChunk(start_ts, prev_ts, size)) - prev_ts, start_ts, size = None, None, 0 + chunks.append(TimeseriesChunk(start_ts, prev_ts, i - start_i)) + prev_ts, start_ts = None, None + start_i = i if start_ts is not None: - chunks.append(TimeseriesChunk(start_ts, prev_ts, size)) + chunks.append(TimeseriesChunk(start_ts, prev_ts, len(df.index) - start_i)) return chunks def create_generator_containers( @@ -768,10 +769,11 @@ def create_generator_containers( ) except ValueError: self.failed_chunks.append(chunk) - length = len(generator) - generator_containers.append( - TimeseriesGeneratorContainer(generator, chunk, length) - ) + else: + length = len(generator) + generator_containers.append( + TimeseriesGeneratorContainer(generator, chunk, length) + ) return generator_containers def __getitem__(self, index): diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py new file mode 100644 index 000000000..75c572e84 --- /dev/null +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -0,0 +1,49 @@ +import pandas as pd +from itertools import chain +from random import randrange + +from gordo.machine.model.models import GordoTimeseriesGenerator, TimeseriesChunk + +def get_test_datetimeindex(time_intervals, freq=None): + if freq is None: + freq = 'H' + dti_iters = (pd.date_range(d, periods=p, freq=freq) for d, p in time_intervals) + return pd.DatetimeIndex(list(chain(*dti_iters))) + +def random_gen(min_value=80, max_value=100): + def generate(values_count): + for v in range(values_count): + yield randrange(min_value, max_value) + return generate + +def get_test_df(time_intervals, generator=None, freq=None, tags_count=3): + if generator is None: + generator = random_gen() + dti = get_test_datetimeindex(time_intervals, freq) + tag_names = ['tag%d' % v for v in range(tags_count)] + data = {k: [] for k in tag_names} + generate_count=len(dti) + for _ in range(generate_count): + for tag_name, value in zip(tag_names, generator(tags_count)): + data[tag_name].append(value) + return pd.DataFrame(data, index=dti).sort_index() + +def test_find_consecutive_chunks(): + test1_time_intervals = ( + ('2018-01-01', 8), + ('2018-01-02', 45), + ('2018-01-04', 10), + ('2018-01-05', 30), + ('2018-02-03', 20), + ) + test1_df = get_test_df(test1_time_intervals) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) + expected_chunks = [TimeseriesChunk(start_ts=pd.Timestamp('2018-01-01 00:00:00'), end_ts=pd.Timestamp('2018-01-01 07:00:00'), size=8), + TimeseriesChunk(start_ts=pd.Timestamp('2018-01-02 01:00:00'), end_ts=pd.Timestamp('2018-01-03 20:00:00'), size=45), + TimeseriesChunk(start_ts=pd.Timestamp('2018-01-04 01:00:00'), end_ts=pd.Timestamp('2018-01-04 09:00:00'), size=10), + TimeseriesChunk(start_ts=pd.Timestamp('2018-01-05 01:00:00'), end_ts=pd.Timestamp('2018-01-06 05:00:00'), size=30), + TimeseriesChunk(start_ts=pd.Timestamp('2018-02-03 01:00:00'), end_ts=pd.Timestamp('2018-02-03 19:00:00'), size=20)] + assert len(gen.consecutive_chunks) == len(expected_chunks) + for chunk, expected_chunk in zip(gen.consecutive_chunks, expected_chunks): + assert chunk == expected_chunk + From 46d73d5e1eab3e75b86b23556ec71ad2878e4e13 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:16:55 +0300 Subject: [PATCH 11/61] test_create_generator_containers() --- .../model/test_gordo_timeseries_generator.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 75c572e84..70aae1ac2 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -47,3 +47,23 @@ def test_find_consecutive_chunks(): for chunk, expected_chunk in zip(gen.consecutive_chunks, expected_chunks): assert chunk == expected_chunk +def test_create_generator_containers(): + test1_time_intervals = ( + ('2018-01-01', 4), + ('2018-01-02', 35), + ('2018-01-04', 10), + ) + test1_df = get_test_df(test1_time_intervals) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) + expected_generator_containers = [ + {'chunk': TimeseriesChunk(start_ts=pd.Timestamp('2018-01-02 01:00:00'), end_ts=pd.Timestamp('2018-01-03 10:00:00'), size=35), 'length': 1}, + {'chunk': TimeseriesChunk(start_ts=pd.Timestamp('2018-01-04 01:00:00'), end_ts=pd.Timestamp('2018-01-04 09:00:00'), size=10), 'length': 1}, + ] + assert len(gen.generators_containers) == 2 + for i, generator_container in enumerate(gen.generators_containers): + for k, v in expected_generator_containers[i].items(): + assert getattr(generator_container, k) == v, "%s.%s != %s" % (generator_container, k, v) + expected_failed_chunk = TimeseriesChunk(start_ts=pd.Timestamp('2018-01-01 00:00:00'), end_ts=pd.Timestamp('2018-01-01 03:00:00'), size=4) + assert len(gen.failed_chunks) == 1 + assert gen.failed_chunks[0] == expected_failed_chunk + From d4d40ac2a6ffcafdbbd058d00a9400d64c360c83 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:51:26 +0300 Subject: [PATCH 12/61] test_timeseries_generator() --- .../model/test_gordo_timeseries_generator.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 70aae1ac2..994bca997 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -1,6 +1,10 @@ +import pytest + import pandas as pd from itertools import chain from random import randrange +from itertools import count +from numpy import ndarray from gordo.machine.model.models import GordoTimeseriesGenerator, TimeseriesChunk @@ -16,6 +20,14 @@ def generate(values_count): yield randrange(min_value, max_value) return generate +def range_gen(): + g=count() + def generate(values_count): + ret_value = next(g) + for v in range(values_count): + yield ret_value + return generate + def get_test_df(time_intervals, generator=None, freq=None, tags_count=3): if generator is None: generator = random_gen() @@ -67,3 +79,45 @@ def test_create_generator_containers(): assert len(gen.failed_chunks) == 1 assert gen.failed_chunks[0] == expected_failed_chunk +def test_timeseries_generator(): + test1_time_intervals = ( + ('2018-01-02', 15), + ('2018-01-04', 10), + ) + test1_df = get_test_df(test1_time_intervals, generator=range_gen(), tags_count=1) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step=60) + assert len(gen.generators_containers) == 2 + assert len(gen) == 6 + x, y = gen[0] + expect_x=[[[0], + [1], + [2], + [3], + [4]], + [[1], + [2], + [3], + [4], + [5]], + + [[2], + [3], + [4], + [5], + [6]]] + expect_y=[[5], + [6], + [7]] + assert x.tolist() == expect_x + assert y.tolist() == expect_y + +def test_too_short_timeseries_length(): + test1_time_intervals = ( + ('2018-01-01', 4), + ('2018-01-02', 6), + ('2018-01-04', 8), + ) + test1_df = get_test_df(test1_time_intervals) + with pytest.raises(ValueError): + GordoTimeseriesGenerator(test1_df, test1_df, length=10, step=60) + From 0375a56b607c4733051672311dbb1a35ec9dc399 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:52:07 +0300 Subject: [PATCH 13/61] black --- gordo/machine/model/models.py | 16 ++- .../model/test_gordo_timeseries_generator.py | 133 ++++++++++++------ 2 files changed, 99 insertions(+), 50 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 9763f4b0c..c57f30910 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -710,15 +710,23 @@ def __init__( step = pd.Timedelta(minutes=step) self.step = step self.consecutive_chunks = self.find_consecutive_chunks(data) - logger.debug('GordoTimeseriesGenerator with consecutive_chunks=%s', self.consecutive_chunks) + logger.debug( + "GordoTimeseriesGenerator with consecutive_chunks=%s", + self.consecutive_chunks, + ) self.failed_chunks: List[TimeseriesChunk] = [] self.generators_containers = self.create_generator_containers( data, targets, length=length, batch_size=batch_size, shuffle=shuffle ) - logger.debug('GordoTimeseriesGenerator with generators_containers=%s', self.generators_containers) + logger.debug( + "GordoTimeseriesGenerator with generators_containers=%s", + self.generators_containers, + ) if not self.generators_containers: - raise ValueError("Seems like the time series are too small or in random order." - "Failed chunks: %s" % self.consecutive_chunks) + raise ValueError( + "Seems like the time series are too small or in random order." + "Failed chunks: %s" % self.consecutive_chunks + ) def filter_chunks(self, indexes=None): if indexes is not None: diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 994bca997..4962594ba 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -8,116 +8,157 @@ from gordo.machine.model.models import GordoTimeseriesGenerator, TimeseriesChunk + def get_test_datetimeindex(time_intervals, freq=None): if freq is None: - freq = 'H' + freq = "H" dti_iters = (pd.date_range(d, periods=p, freq=freq) for d, p in time_intervals) return pd.DatetimeIndex(list(chain(*dti_iters))) + def random_gen(min_value=80, max_value=100): def generate(values_count): for v in range(values_count): yield randrange(min_value, max_value) + return generate + def range_gen(): - g=count() + g = count() + def generate(values_count): ret_value = next(g) for v in range(values_count): yield ret_value + return generate + def get_test_df(time_intervals, generator=None, freq=None, tags_count=3): if generator is None: generator = random_gen() dti = get_test_datetimeindex(time_intervals, freq) - tag_names = ['tag%d' % v for v in range(tags_count)] + tag_names = ["tag%d" % v for v in range(tags_count)] data = {k: [] for k in tag_names} - generate_count=len(dti) + generate_count = len(dti) for _ in range(generate_count): for tag_name, value in zip(tag_names, generator(tags_count)): data[tag_name].append(value) return pd.DataFrame(data, index=dti).sort_index() + def test_find_consecutive_chunks(): test1_time_intervals = ( - ('2018-01-01', 8), - ('2018-01-02', 45), - ('2018-01-04', 10), - ('2018-01-05', 30), - ('2018-02-03', 20), + ("2018-01-01", 8), + ("2018-01-02", 45), + ("2018-01-04", 10), + ("2018-01-05", 30), + ("2018-02-03", 20), ) test1_df = get_test_df(test1_time_intervals) gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) - expected_chunks = [TimeseriesChunk(start_ts=pd.Timestamp('2018-01-01 00:00:00'), end_ts=pd.Timestamp('2018-01-01 07:00:00'), size=8), - TimeseriesChunk(start_ts=pd.Timestamp('2018-01-02 01:00:00'), end_ts=pd.Timestamp('2018-01-03 20:00:00'), size=45), - TimeseriesChunk(start_ts=pd.Timestamp('2018-01-04 01:00:00'), end_ts=pd.Timestamp('2018-01-04 09:00:00'), size=10), - TimeseriesChunk(start_ts=pd.Timestamp('2018-01-05 01:00:00'), end_ts=pd.Timestamp('2018-01-06 05:00:00'), size=30), - TimeseriesChunk(start_ts=pd.Timestamp('2018-02-03 01:00:00'), end_ts=pd.Timestamp('2018-02-03 19:00:00'), size=20)] + expected_chunks = [ + TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-01 00:00:00"), + end_ts=pd.Timestamp("2018-01-01 07:00:00"), + size=8, + ), + TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-02 01:00:00"), + end_ts=pd.Timestamp("2018-01-03 20:00:00"), + size=45, + ), + TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-04 01:00:00"), + end_ts=pd.Timestamp("2018-01-04 09:00:00"), + size=10, + ), + TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-05 01:00:00"), + end_ts=pd.Timestamp("2018-01-06 05:00:00"), + size=30, + ), + TimeseriesChunk( + start_ts=pd.Timestamp("2018-02-03 01:00:00"), + end_ts=pd.Timestamp("2018-02-03 19:00:00"), + size=20, + ), + ] assert len(gen.consecutive_chunks) == len(expected_chunks) for chunk, expected_chunk in zip(gen.consecutive_chunks, expected_chunks): assert chunk == expected_chunk + def test_create_generator_containers(): test1_time_intervals = ( - ('2018-01-01', 4), - ('2018-01-02', 35), - ('2018-01-04', 10), + ("2018-01-01", 4), + ("2018-01-02", 35), + ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals) gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) expected_generator_containers = [ - {'chunk': TimeseriesChunk(start_ts=pd.Timestamp('2018-01-02 01:00:00'), end_ts=pd.Timestamp('2018-01-03 10:00:00'), size=35), 'length': 1}, - {'chunk': TimeseriesChunk(start_ts=pd.Timestamp('2018-01-04 01:00:00'), end_ts=pd.Timestamp('2018-01-04 09:00:00'), size=10), 'length': 1}, + { + "chunk": TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-02 01:00:00"), + end_ts=pd.Timestamp("2018-01-03 10:00:00"), + size=35, + ), + "length": 1, + }, + { + "chunk": TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-04 01:00:00"), + end_ts=pd.Timestamp("2018-01-04 09:00:00"), + size=10, + ), + "length": 1, + }, ] assert len(gen.generators_containers) == 2 for i, generator_container in enumerate(gen.generators_containers): for k, v in expected_generator_containers[i].items(): - assert getattr(generator_container, k) == v, "%s.%s != %s" % (generator_container, k, v) - expected_failed_chunk = TimeseriesChunk(start_ts=pd.Timestamp('2018-01-01 00:00:00'), end_ts=pd.Timestamp('2018-01-01 03:00:00'), size=4) + assert getattr(generator_container, k) == v, "%s.%s != %s" % ( + generator_container, + k, + v, + ) + expected_failed_chunk = TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-01 00:00:00"), + end_ts=pd.Timestamp("2018-01-01 03:00:00"), + size=4, + ) assert len(gen.failed_chunks) == 1 assert gen.failed_chunks[0] == expected_failed_chunk + def test_timeseries_generator(): test1_time_intervals = ( - ('2018-01-02', 15), - ('2018-01-04', 10), + ("2018-01-02", 15), + ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals, generator=range_gen(), tags_count=1) gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step=60) assert len(gen.generators_containers) == 2 assert len(gen) == 6 x, y = gen[0] - expect_x=[[[0], - [1], - [2], - [3], - [4]], - [[1], - [2], - [3], - [4], - [5]], - - [[2], - [3], - [4], - [5], - [6]]] - expect_y=[[5], - [6], - [7]] + expect_x = [ + [[0], [1], [2], [3], [4]], + [[1], [2], [3], [4], [5]], + [[2], [3], [4], [5], [6]], + ] + expect_y = [[5], [6], [7]] assert x.tolist() == expect_x assert y.tolist() == expect_y + def test_too_short_timeseries_length(): test1_time_intervals = ( - ('2018-01-01', 4), - ('2018-01-02', 6), - ('2018-01-04', 8), + ("2018-01-01", 4), + ("2018-01-02", 6), + ("2018-01-04", 8), ) test1_df = get_test_df(test1_time_intervals) with pytest.raises(ValueError): GordoTimeseriesGenerator(test1_df, test1_df, length=10, step=60) - From 1a47bc50dfe9d28d2c4bc8f9c4c5c449d33654fa Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:52:27 +0300 Subject: [PATCH 14/61] Remove unused import --- tests/gordo/machine/model/test_gordo_timeseries_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 4962594ba..660f1a7c4 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -4,7 +4,6 @@ from itertools import chain from random import randrange from itertools import count -from numpy import ndarray from gordo.machine.model.models import GordoTimeseriesGenerator, TimeseriesChunk From 323eda446377defe77e22231ba403b84477ba90c Mon Sep 17 00:00:00 2001 From: Serhii Date: Fri, 10 Apr 2020 10:02:16 +0300 Subject: [PATCH 15/61] Consecutive index for RandomDataset --- .../dataset/data_provider/providers.py | 37 +++++++++++++++---- gordo/machine/dataset/datasets.py | 6 ++- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/gordo/machine/dataset/data_provider/providers.py b/gordo/machine/dataset/data_provider/providers.py index 7d3e19150..c069b0c23 100644 --- a/gordo/machine/dataset/data_provider/providers.py +++ b/gordo/machine/dataset/data_provider/providers.py @@ -4,6 +4,7 @@ import logging import threading import timeit +import math from datetime import datetime import typing @@ -351,9 +352,20 @@ def can_handle_tag(self, tag: SensorTag): return True # We can be random about everything @capture_args - def __init__(self, min_size=100, max_size=300, **kwargs): + def __init__( + self, + min_size=100, + max_size=300, + randomize_dates=True, + consecutive_freq=None, + **kwargs, + ): self.max_size = max_size self.min_size = min_size + self.randomize_dates = randomize_dates + self.consecutive_freq = ( + consecutive_freq if consecutive_freq is not None else "10min" + ) np.random.seed(0) # Thanks stackoverflow @@ -369,6 +381,15 @@ def _random_dates(start, end, n=10): pd.to_datetime(np.random.randint(start_u, end_u, n), unit="s", utc=True) ) + @staticmethod + def _consecutive_dates(start, end, freq): + start = pd.to_datetime(start) + end = pd.to_datetime(end) + step = pd.to_timedelta(freq) + periods = int(math.floor((end - start) / step)) + dr = pd.date_range(start, periods=periods, freq=freq) + return pd.DatetimeIndex(dr) + def load_series( self, train_start_date: datetime, @@ -381,12 +402,14 @@ def load_series( "Dry run for RandomDataProvider is not implemented" ) for tag in tag_list: - nr = random.randint(self.min_size, self.max_size) - - random_index = self._random_dates(train_start_date, train_end_date, n=nr) + if self.randomize_dates: + nr = random.randint(self.min_size, self.max_size) + index = self._random_dates(train_start_date, train_end_date, n=nr) + else: + index = self._consecutive_dates( + train_start_date, train_end_date, freq=self.consecutive_freq + ) series = pd.Series( - index=random_index, - name=tag.name, - data=np.random.random(size=len(random_index)), + index=index, name=tag.name, data=np.random.random(size=len(index)), ) yield series diff --git a/gordo/machine/dataset/datasets.py b/gordo/machine/dataset/datasets.py index 1e832d61a..6965315b5 100644 --- a/gordo/machine/dataset/datasets.py +++ b/gordo/machine/dataset/datasets.py @@ -251,11 +251,15 @@ def __init__( train_start_date: Union[datetime, str], train_end_date: Union[datetime, str], tag_list: list, + randomize_dates: bool = True, + consecutive_freq: Optional[str] = None, **kwargs, ): kwargs.pop("data_provider", None) # Dont care what you ask for, you get random! super().__init__( - data_provider=RandomDataProvider(), + data_provider=RandomDataProvider( + randomize_dates=randomize_dates, consecutive_freq=consecutive_freq + ), train_start_date=train_start_date, train_end_date=train_end_date, tag_list=tag_list, From a76152d3b6a82482488336048c9ee18eb266bb15 Mon Sep 17 00:00:00 2001 From: Serhii Date: Fri, 10 Apr 2020 10:40:35 +0300 Subject: [PATCH 16/61] test_random_data_provider_consecutive() --- gordo/machine/dataset/data_provider/providers.py | 3 ++- .../dataset/data_provider/test_data_providers.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/gordo/machine/dataset/data_provider/providers.py b/gordo/machine/dataset/data_provider/providers.py index c069b0c23..ca489261f 100644 --- a/gordo/machine/dataset/data_provider/providers.py +++ b/gordo/machine/dataset/data_provider/providers.py @@ -7,6 +7,7 @@ import math from datetime import datetime +from dateutil.tz import tzutc import typing from cachetools import cached, TTLCache @@ -387,7 +388,7 @@ def _consecutive_dates(start, end, freq): end = pd.to_datetime(end) step = pd.to_timedelta(freq) periods = int(math.floor((end - start) / step)) - dr = pd.date_range(start, periods=periods, freq=freq) + dr = pd.date_range(start, periods=periods, freq=freq, tz=tzutc()) return pd.DatetimeIndex(dr) def load_series( diff --git a/tests/gordo/machine/dataset/data_provider/test_data_providers.py b/tests/gordo/machine/dataset/data_provider/test_data_providers.py index 294c7ddaf..2ea1b249b 100644 --- a/tests/gordo/machine/dataset/data_provider/test_data_providers.py +++ b/tests/gordo/machine/dataset/data_provider/test_data_providers.py @@ -1,6 +1,7 @@ import re import unittest from datetime import datetime +from dateutil.parser import isoparse from typing import Iterable, List, Pattern, Any import pandas as pd @@ -9,7 +10,7 @@ from gordo.machine.dataset.data_provider.base import GordoBaseDataProvider from gordo.machine.dataset.data_provider import providers from gordo.machine.dataset.data_provider.providers import ( - load_series_from_multiple_providers, + load_series_from_multiple_providers, RandomDataProvider ) from gordo.machine.dataset.sensor_tag import SensorTag @@ -128,3 +129,15 @@ def test_data_provider_serializations( # Should be able to recreate the object from encoded directly cloned = provider.__class__.from_dict(encoded) assert type(cloned) == type(provider) + +def test_random_data_provider_consecutive(): + provider = RandomDataProvider(randomize_dates=False, consecutive_freq='10min') + tag_list = [SensorTag('tag1', '')] + series = list(provider.load_series(isoparse('2020-04-01 00:00:00'), isoparse('2020-04-01 01:00:00'), tag_list=tag_list)) + assert len(series) == 1 + first_series = series[0] + assert len(first_series) == 6 + assert first_series.index[0] == pd.to_datetime('2020-04-01 00:00:00+00:00') + assert first_series.index[5] == pd.to_datetime('2020-04-01 00:50:00+00:00') + + From 01c8a1aaf9f57039f886bf21d39d951c9669da82 Mon Sep 17 00:00:00 2001 From: Serhii Date: Fri, 10 Apr 2020 10:41:14 +0300 Subject: [PATCH 17/61] black --- .../data_provider/test_data_providers.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/gordo/machine/dataset/data_provider/test_data_providers.py b/tests/gordo/machine/dataset/data_provider/test_data_providers.py index 2ea1b249b..025ee3920 100644 --- a/tests/gordo/machine/dataset/data_provider/test_data_providers.py +++ b/tests/gordo/machine/dataset/data_provider/test_data_providers.py @@ -10,7 +10,8 @@ from gordo.machine.dataset.data_provider.base import GordoBaseDataProvider from gordo.machine.dataset.data_provider import providers from gordo.machine.dataset.data_provider.providers import ( - load_series_from_multiple_providers, RandomDataProvider + load_series_from_multiple_providers, + RandomDataProvider, ) from gordo.machine.dataset.sensor_tag import SensorTag @@ -130,14 +131,19 @@ def test_data_provider_serializations( cloned = provider.__class__.from_dict(encoded) assert type(cloned) == type(provider) + def test_random_data_provider_consecutive(): - provider = RandomDataProvider(randomize_dates=False, consecutive_freq='10min') - tag_list = [SensorTag('tag1', '')] - series = list(provider.load_series(isoparse('2020-04-01 00:00:00'), isoparse('2020-04-01 01:00:00'), tag_list=tag_list)) + provider = RandomDataProvider(randomize_dates=False, consecutive_freq="10min") + tag_list = [SensorTag("tag1", "")] + series = list( + provider.load_series( + isoparse("2020-04-01 00:00:00"), + isoparse("2020-04-01 01:00:00"), + tag_list=tag_list, + ) + ) assert len(series) == 1 first_series = series[0] assert len(first_series) == 6 - assert first_series.index[0] == pd.to_datetime('2020-04-01 00:00:00+00:00') - assert first_series.index[5] == pd.to_datetime('2020-04-01 00:50:00+00:00') - - + assert first_series.index[0] == pd.to_datetime("2020-04-01 00:00:00+00:00") + assert first_series.index[5] == pd.to_datetime("2020-04-01 00:50:00+00:00") From 1f4fd1486fdb59bd6a5162f445e888320a2247b4 Mon Sep 17 00:00:00 2001 From: Serhii Date: Fri, 10 Apr 2020 14:25:52 +0300 Subject: [PATCH 18/61] timeseries_generators() --- gordo/machine/model/models.py | 69 +++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index c57f30910..ba4a71fa7 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -7,6 +7,7 @@ from typing import Union, Callable, Dict, Any, Optional, Tuple, List from abc import ABCMeta from dataclasses import dataclass +from copy import copy import h5py import tensorflow.keras.models @@ -333,6 +334,7 @@ def __init__( kind: Union[Callable, str], lookback_window: int = 1, batch_size: int = 32, + timeseries_generator: Optional[Dict[str, Any]] = None, **kwargs, ) -> None: """ @@ -365,6 +367,8 @@ def __init__( kwargs["kind"] = kind kwargs["batch_size"] = batch_size + self.timeseries_generator_config = timeseries_generator + # fit_generator_params is a set of strings with the keyword arguments of # Keras fit_generator method (excluding "shuffle" as this will be hardcoded). # This will be used in the fit method of the respective subclasses to match @@ -456,6 +460,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": batch_size=1, lookback_window=self.lookback_window, lookahead=self.lookahead, + config=self.timeseries_generator_config, ) primer_x, primer_y = tsg[0] @@ -468,6 +473,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": batch_size=self.batch_size, lookback_window=self.lookback_window, lookahead=self.lookahead, + config=self.timeseries_generator_config, ) gen_kwargs = { @@ -605,6 +611,7 @@ def create_keras_timeseriesgenerator( batch_size: int, lookback_window: int, lookahead: int, + config: Optional[Dict[str, Any]] = None, ) -> TimeseriesGenerator: """ Provides a `keras.preprocessing.sequence.TimeseriesGenerator` for use with @@ -657,19 +664,47 @@ def create_keras_timeseriesgenerator( >>> len(gen[0][0][0][0]) # n_features = 2 2 """ + X, y = pad_x_and_y(X, y, lookahead) + return timeseries_generators.create_from_config( + config, data=X, targets=y, length=lookback_window, batch_size=batch_size + ) - if isinstance(X, pd.DataFrame): - if not isinstance(y, pd.DataFrame): - raise ValueError("'y' should be an instance of pandas.DataFrame") - # TODO padding for X and y - return GordoTimeseriesGenerator( - data=X, targets=y, length=lookback_window, batch_size=batch_size - ) - else: - X, y = pad_x_and_y(X, y, lookahead) - return TimeseriesGenerator( - data=X, targets=y, length=lookback_window, batch_size=batch_size - ) + +class TimeseriesGeneratorTypes: + def __init__(self, default_type): + self.default_type = default_type + self._types = {} + + def create_from_config(self, config, **kwargs): + if config is None: + return self.default_type(**kwargs) + else: + if "type" not in config: + raise ValueError( + 'Unspecified "type" attribute for "timeseries_generator"' + ) + type_name = config["type"] + if type_name not in self._types: + raise ValueError( + f'Unknown type "{type_name}" for "timeseries_generator"' + ) + all_kwargs = copy(config).pop("type") + all_kwargs.update(kwargs) + return self._types[type_name](**all_kwargs) + + def __call__(self, type_name): + def wrap(cls): + if type_name in self._types: + raise ValueError( + f'TimeseriesGenerator type with name "{type_name}" already exists' + ) + self._types[type_name] = cls + return cls + + return wrap + + +timeseries_generators = TimeseriesGeneratorTypes(default_type=TimeseriesGenerator) @dataclass @@ -686,17 +721,21 @@ class TimeseriesGeneratorContainer: length: int +@timeseries_generators("GordoTimeseriesGenerator") class GordoTimeseriesGenerator(data_utils.Sequence): def __init__( self, - data: pd.DataFrame, - targets: pd.DataFrame, + data: Union[pd.DataFrame, np.ndarray], + targets: Union[pd.DataFrame, np.ndarray], length: int, batch_size: int = 128, shuffle: bool = False, step: Optional[Union[pd.Timedelta, int]] = None, ): - + if not isinstance(data, pd.DataFrame): + raise ValueError("Data have to be instance of pandas.DataFrame") + if not isinstance(targets, pd.DataFrame): + raise ValueError("Targets have to be instance of pandas.DataFrame") if len(data) != len(targets): raise ValueError( "Data and targets have to be of same length. " From e6935219228efdf5e9b1a0c0a8046f881efcb393 Mon Sep 17 00:00:00 2001 From: Serhii Date: Fri, 10 Apr 2020 14:43:38 +0300 Subject: [PATCH 19/61] Use names of pd.Timedelta instead of number of minutes in config --- gordo/machine/model/models.py | 8 +++----- .../machine/model/test_gordo_timeseries_generator.py | 8 ++++---- tests/gordo/machine/model/test_model.py | 2 ++ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index ba4a71fa7..8ad9bd711 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -730,7 +730,7 @@ def __init__( length: int, batch_size: int = 128, shuffle: bool = False, - step: Optional[Union[pd.Timedelta, int]] = None, + step: Union[pd.Timedelta, str] = '10min', ): if not isinstance(data, pd.DataFrame): raise ValueError("Data have to be instance of pandas.DataFrame") @@ -743,10 +743,8 @@ def __init__( f" while target length is {len(targets)}" ) - if step is None: - step = pd.Timedelta(minutes=10) - if isinstance(step, int): - step = pd.Timedelta(minutes=step) + if isinstance(step, str): + step = pd.to_timedelta(step) self.step = step self.consecutive_chunks = self.find_consecutive_chunks(data) logger.debug( diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 660f1a7c4..6d763fe79 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -56,7 +56,7 @@ def test_find_consecutive_chunks(): ("2018-02-03", 20), ) test1_df = get_test_df(test1_time_intervals) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step='60min') expected_chunks = [ TimeseriesChunk( start_ts=pd.Timestamp("2018-01-01 00:00:00"), @@ -96,7 +96,7 @@ def test_create_generator_containers(): ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step='60min') expected_generator_containers = [ { "chunk": TimeseriesChunk( @@ -138,7 +138,7 @@ def test_timeseries_generator(): ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals, generator=range_gen(), tags_count=1) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step=60) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step='60min') assert len(gen.generators_containers) == 2 assert len(gen) == 6 x, y = gen[0] @@ -160,4 +160,4 @@ def test_too_short_timeseries_length(): ) test1_df = get_test_df(test1_time_intervals) with pytest.raises(ValueError): - GordoTimeseriesGenerator(test1_df, test1_df, length=10, step=60) + GordoTimeseriesGenerator(test1_df, test1_df, length=10, step='60min') diff --git a/tests/gordo/machine/model/test_model.py b/tests/gordo/machine/model/test_model.py index 29f3bae4c..dae3028c8 100644 --- a/tests/gordo/machine/model/test_model.py +++ b/tests/gordo/machine/model/test_model.py @@ -333,3 +333,5 @@ def test_lstmae_predict_output(): xTest = np.random.random(size=(4, 3)) out = model.predict(xTest) assert out.shape == (2, 3) + +#TODO test with GordoTimeseriesGenerator \ No newline at end of file From 409070c4a53fc1af6a7dcef933fcfd06c8b43062 Mon Sep 17 00:00:00 2001 From: Serhii Date: Fri, 10 Apr 2020 18:29:59 +0300 Subject: [PATCH 20/61] Fix issues with of GordoTimeseriesGenerator and local_build() --- gordo/machine/model/models.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 8ad9bd711..9540b050a 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -366,8 +366,7 @@ def __init__( kwargs["lookback_window"] = lookback_window kwargs["kind"] = kind kwargs["batch_size"] = batch_size - - self.timeseries_generator_config = timeseries_generator + kwargs["timeseries_generator"] = timeseries_generator # fit_generator_params is a set of strings with the keyword arguments of # Keras fit_generator method (excluding "shuffle" as this will be hardcoded). @@ -398,6 +397,10 @@ def lookahead(self) -> int: """Steps ahead in y the model should target""" ... + @property + def timeseries_generator(self): + return self.kwargs.get('timeseries_generator', None) + def get_metadata(self): """ Add number of forecast steps to metadata @@ -460,7 +463,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": batch_size=1, lookback_window=self.lookback_window, lookahead=self.lookahead, - config=self.timeseries_generator_config, + config=self.timeseries_generator, ) primer_x, primer_y = tsg[0] @@ -473,7 +476,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": batch_size=self.batch_size, lookback_window=self.lookback_window, lookahead=self.lookahead, - config=self.timeseries_generator_config, + config=self.timeseries_generator, ) gen_kwargs = { @@ -533,6 +536,7 @@ def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: batch_size=10000, lookback_window=self.lookback_window, lookahead=self.lookahead, + config=self.timeseries_generator, ) return self.model.predict_generator(tsg) @@ -688,7 +692,8 @@ def create_from_config(self, config, **kwargs): raise ValueError( f'Unknown type "{type_name}" for "timeseries_generator"' ) - all_kwargs = copy(config).pop("type") + all_kwargs = copy(config) + all_kwargs.pop("type") all_kwargs.update(kwargs) return self._types[type_name](**all_kwargs) From c0ec8548e2ea2e91ce14a6f16afdf5585a8828e4 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 12:11:16 +0300 Subject: [PATCH 21/61] sklearn-pandas~=1.8.0 --- requirements/full_requirements.txt | 67 +++++++++++++++--------------- requirements/requirements.in | 1 + 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/requirements/full_requirements.txt b/requirements/full_requirements.txt index 5ddd1ef4e..2db4f2524 100644 --- a/requirements/full_requirements.txt +++ b/requirements/full_requirements.txt @@ -11,37 +11,37 @@ aniso8601==8.0.0 # via flask-restplus astor==0.8.1 # via tensorflow attrs==19.3.0 # via jsonschema azure-common==1.1.24 # via azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-resource, azure-mgmt-storage, azureml-core -azure-datalake-store==0.0.48 +azure-datalake-store==0.0.48 # via -r requirements.in azure-graphrbac==0.61.1 # via azureml-core azure-mgmt-authorization==0.60.0 # via azureml-core azure-mgmt-containerregistry==2.8.0 # via azureml-core azure-mgmt-keyvault==2.0.0 # via azureml-core azure-mgmt-resource==8.0.0 # via azureml-core azure-mgmt-storage==7.1.0 # via azureml-core -azureml-contrib-run==1.0.85 +azureml-contrib-run==1.0.85 # via -r mlflow_requirements.in azureml-core==1.0.85 # via azureml-mlflow azureml-mlflow==1.0.85 # via azureml-contrib-run backports.tempfile==1.0 # via azureml-core backports.weakref==1.0.post1 # via backports.tempfile -cachetools==4.0.0 -catboost==0.20.2 -cchardet==2.1.5 +cachetools==4.0.0 # via -r requirements.in, google-auth +catboost==0.20.2 # via -r requirements.in +cchardet==2.1.5 # via -r requirements.in certifi==2019.11.28 # via msrest, requests cffi==1.13.2 # via azure-datalake-store, cryptography chardet==3.0.4 # via requests -click==7.0 +click==7.0 # via -r requirements.in, databricks-cli, flask, mlflow cloudpickle==1.2.2 # via mlflow configparser==4.0.2 # via databricks-cli contextlib2==0.6.0.post1 # via azureml-core cryptography==2.8 # via adal, azureml-core, pyopenssl, secretstorage cycler==0.10.0 # via matplotlib databricks-cli==0.9.1 # via mlflow -dataclasses-json==0.3.7 -dictdiffer==0.8.1 +dataclasses-json==0.3.7 # via -r requirements.in +dictdiffer==0.8.1 # via -r requirements.in docker==4.1.0 # via azureml-core, mlflow entrypoints==0.3 # via mlflow -flask-restplus==0.13.0 -flask==1.1.1 +flask-restplus==0.13.0 # via -r requirements.in +flask==1.1.1 # via -r requirements.in, flask-restplus, mlflow, prometheus-flask-exporter gast==0.2.2 # via tensorflow gitdb2==2.0.6 # via gitpython gitpython==3.0.5 # via mlflow @@ -51,15 +51,15 @@ google-pasta==0.1.8 # via tensorflow gorilla==0.3.0 # via mlflow graphviz==0.13.2 # via catboost grpcio==1.26.0 # via tensorboard, tensorflow -gunicorn==20.0.4 -h5py==2.10.0 +gunicorn==20.0.4 # via -r requirements.in, mlflow +h5py==2.10.0 # via -r requirements.in, keras-applications idna==2.8 # via requests importlib-metadata==1.4.0 # via jsonschema -influxdb==5.2.3 +influxdb==5.2.3 # via -r requirements.in isodate==0.6.0 # via msrest itsdangerous==1.1.0 # via flask jeepney==0.4.2 # via secretstorage -jinja2==2.10.3 +jinja2==2.10.3 # via -r requirements.in, flask jmespath==0.9.4 # via azureml-core joblib==0.14.1 # via scikit-learn jsonpickle==1.2 # via azureml-core, azureml-mlflow @@ -73,25 +73,25 @@ markupsafe==1.1.1 # via jinja2, mako marshmallow-enum==1.5.1 # via dataclasses-json marshmallow==3.3.0 # via dataclasses-json, marshmallow-enum matplotlib==3.1.2 # via catboost -mlflow==1.5.0 +mlflow==1.5.0 # via -r mlflow_requirements.in, azureml-mlflow more-itertools==8.1.0 # via zipp msrest==0.6.10 # via azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-resource, azure-mgmt-storage, azureml-core, msrestazure msrestazure==0.6.2 # via azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-resource, azure-mgmt-storage, azureml-core mypy-extensions==0.4.3 # via typing-inspect ndg-httpsclient==0.5.1 # via azureml-core -numexpr==2.7.1 -numpy==1.18.1 +numexpr==2.7.1 # via -r requirements.in +numpy==1.18.1 # via -r requirements.in, catboost, h5py, keras-applications, keras-preprocessing, matplotlib, mlflow, numexpr, opt-einsum, pandas, pyarrow, scikit-learn, scipy, sklearn-pandas, tensorboard, tensorflow oauthlib==3.1.0 # via requests-oauthlib opt-einsum==3.1.0 # via tensorflow -pandas==1.0.0 +pandas==1.0.0 # via -r requirements.in, catboost, mlflow, sklearn-pandas pathspec==0.7.0 # via azureml-core -peewee==3.13.1 +peewee==3.13.1 # via -r postgres_requirements.in plotly==4.4.1 # via catboost prometheus-client==0.7.1 # via prometheus-flask-exporter prometheus-flask-exporter==0.12.1 # via mlflow protobuf==3.11.2 # via mlflow, tensorboard, tensorflow -psycopg2-binary==2.8.4 -pyarrow==0.15.1 +psycopg2-binary==2.8.4 # via -r postgres_requirements.in +pyarrow==0.15.1 # via -r requirements.in pyasn1-modules==0.2.8 # via google-auth pyasn1==0.4.8 # via ndg-httpsclient, pyasn1-modules, rsa pycparser==2.19 # via cffi @@ -99,21 +99,22 @@ pyjwt==1.7.1 # via adal, azureml-core pyopenssl==19.1.0 # via azureml-core, ndg-httpsclient pyparsing==2.4.6 # via matplotlib pyrsistent==0.15.7 # via jsonschema -python-dateutil==2.8.1 +python-dateutil==2.8.1 # via -r requirements.in, adal, alembic, azureml-core, influxdb, matplotlib, mlflow, pandas python-editor==1.0.4 # via alembic pytz==2019.3 # via azureml-core, flask-restplus, influxdb, pandas -pyyaml==5.3 +pyyaml==5.3 # via -r requirements.in, mlflow querystring-parser==1.2.4 # via mlflow requests-oauthlib==1.3.0 # via google-auth-oauthlib, msrest -requests==2.22.0 +requests==2.22.0 # via -r requirements.in, adal, azure-datalake-store, azureml-core, databricks-cli, docker, influxdb, mlflow, msrest, requests-oauthlib, tensorboard retrying==1.3.3 # via plotly rsa==4.0 # via google-auth ruamel.yaml==0.15.89 # via azureml-core -scikit-learn==0.22.1 -scipy==1.4.1 # via catboost, scikit-learn, tensorflow +scikit-learn==0.22.1 # via -r requirements.in, sklearn-pandas +scipy==1.4.1 # via catboost, scikit-learn, sklearn-pandas, tensorflow secretstorage==3.1.2 # via azureml-core -simplejson==3.17.0 -six==1.14.0 # via absl-py, azureml-core, catboost, cryptography, cycler, databricks-cli, docker, flask-restplus, google-auth, google-pasta, grpcio, h5py, influxdb, isodate, jsonschema, keras-preprocessing, mlflow, plotly, protobuf, pyarrow, pyopenssl, pyrsistent, python-dateutil, querystring-parser, retrying, tensorboard, tensorflow, websocket-client +simplejson==3.17.0 # via -r requirements.in, mlflow +six==1.14.0 # via absl-py, azureml-core, catboost, cryptography, cycler, databricks-cli, docker, flask-restplus, google-auth, google-pasta, grpcio, h5py, influxdb, jsonschema, keras-preprocessing, mlflow, plotly, protobuf, pyarrow, pyopenssl, pyrsistent, python-dateutil, querystring-parser, retrying, tensorboard, tensorflow, websocket-client +sklearn-pandas==1.8.0 # via -r requirements.in smmap2==2.0.5 # via gitdb2 sqlalchemy==1.3.13 # via alembic, mlflow sqlparse==0.3.0 # via mlflow @@ -121,15 +122,15 @@ stringcase==1.2.0 # via dataclasses-json tabulate==0.8.6 # via databricks-cli tensorboard==2.1.0 # via tensorflow tensorflow-estimator==2.1.0 # via tensorflow -tensorflow==2.1.0 +tensorflow==2.1.0 # via -r requirements.in termcolor==1.1.0 # via tensorflow -typing-extensions==3.7.4.1 +typing-extensions==3.7.4.1 # via -r requirements.in, typing-inspect typing-inspect==0.5.0 # via dataclasses-json -urllib3==1.25.7 +urllib3==1.25.7 # via -r requirements.in, azureml-core, requests websocket-client==0.57.0 # via docker -werkzeug==0.16.1 +werkzeug==0.16.1 # via -r requirements.in, flask, tensorboard wheel==0.33.6 # via tensorboard, tensorflow -wrapt==1.11.2 +wrapt==1.11.2 # via -r requirements.in, tensorflow zipp==2.0.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements/requirements.in b/requirements/requirements.in index b993f3c4e..4c67e9185 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -25,3 +25,4 @@ simplejson~=3.17 catboost~=0.20 typing_extensions~=3.7 wrapt~=1.11 +sklearn-pandas~=1.8.0 From d2086134a06ccc6c702a593d7c500ea53c561c1a Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 13:38:03 +0300 Subject: [PATCH 22/61] Play around with DataFrameMapper --- gordo/machine/model/data_frame_mapper.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 gordo/machine/model/data_frame_mapper.py diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py new file mode 100644 index 000000000..d39269059 --- /dev/null +++ b/gordo/machine/model/data_frame_mapper.py @@ -0,0 +1,29 @@ +from pydoc import locate +from sklearn_pandas import DataFrameMapper, gen_features +from copy import copy +from typing import List, Union, Optional + + +class DataFrameMapper(DataFrameMapper): + + _default_kwargs = { + "df_out": True + } + + def __init__(self, columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, **kwargs): + if classes is not None: + classes = copy(classes) + self._prepare_classes(classes) + features = gen_features(columns=columns, classes=classes) + base_kwargs = copy(self._default_kwargs) + base_kwargs.update(kwargs) + super().__init__(features=features, **kwargs) + + @staticmethod + def _prepare_classes(classes: List[dict]): + for i, v in enumerate(classes): + if "class" not in v: + raise ValueError("\"class\" attribute is empty") + if isinstance(v["class"], str): + cls = locate(v["class"]) + classes[i]["class"] = cls From 813d4079b951b4544213474ab3d2bb894d185914 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 13:39:25 +0300 Subject: [PATCH 23/61] Fix kwargs argument for DataFrameMapper --- gordo/machine/model/data_frame_mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index d39269059..ff76a51dd 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -17,7 +17,7 @@ def __init__(self, columns: List[Union[str, List[str]]], classes: Optional[List[ features = gen_features(columns=columns, classes=classes) base_kwargs = copy(self._default_kwargs) base_kwargs.update(kwargs) - super().__init__(features=features, **kwargs) + super().__init__(features=features, **base_kwargs) @staticmethod def _prepare_classes(classes: List[dict]): From 46413a067e190d81e4afeb33c35ceb0d249cc8f2 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 13:40:00 +0300 Subject: [PATCH 24/61] black --- gordo/machine/model/data_frame_mapper.py | 13 ++++++++----- gordo/machine/model/models.py | 4 ++-- .../model/test_gordo_timeseries_generator.py | 10 ++++++---- tests/gordo/machine/model/test_model.py | 3 ++- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index ff76a51dd..dd74be58c 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -6,11 +6,14 @@ class DataFrameMapper(DataFrameMapper): - _default_kwargs = { - "df_out": True - } + _default_kwargs = {"df_out": True} - def __init__(self, columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, **kwargs): + def __init__( + self, + columns: List[Union[str, List[str]]], + classes: Optional[List[dict]] = None, + **kwargs + ): if classes is not None: classes = copy(classes) self._prepare_classes(classes) @@ -23,7 +26,7 @@ def __init__(self, columns: List[Union[str, List[str]]], classes: Optional[List[ def _prepare_classes(classes: List[dict]): for i, v in enumerate(classes): if "class" not in v: - raise ValueError("\"class\" attribute is empty") + raise ValueError('"class" attribute is empty') if isinstance(v["class"], str): cls = locate(v["class"]) classes[i]["class"] = cls diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 9540b050a..3f5d38625 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -399,7 +399,7 @@ def lookahead(self) -> int: @property def timeseries_generator(self): - return self.kwargs.get('timeseries_generator', None) + return self.kwargs.get("timeseries_generator", None) def get_metadata(self): """ @@ -735,7 +735,7 @@ def __init__( length: int, batch_size: int = 128, shuffle: bool = False, - step: Union[pd.Timedelta, str] = '10min', + step: Union[pd.Timedelta, str] = "10min", ): if not isinstance(data, pd.DataFrame): raise ValueError("Data have to be instance of pandas.DataFrame") diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 6d763fe79..efd32ac64 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -56,7 +56,7 @@ def test_find_consecutive_chunks(): ("2018-02-03", 20), ) test1_df = get_test_df(test1_time_intervals) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step='60min') + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step="60min") expected_chunks = [ TimeseriesChunk( start_ts=pd.Timestamp("2018-01-01 00:00:00"), @@ -96,7 +96,7 @@ def test_create_generator_containers(): ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step='60min') + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step="60min") expected_generator_containers = [ { "chunk": TimeseriesChunk( @@ -138,7 +138,9 @@ def test_timeseries_generator(): ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals, generator=range_gen(), tags_count=1) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step='60min') + gen = GordoTimeseriesGenerator( + test1_df, test1_df, length=5, batch_size=3, step="60min" + ) assert len(gen.generators_containers) == 2 assert len(gen) == 6 x, y = gen[0] @@ -160,4 +162,4 @@ def test_too_short_timeseries_length(): ) test1_df = get_test_df(test1_time_intervals) with pytest.raises(ValueError): - GordoTimeseriesGenerator(test1_df, test1_df, length=10, step='60min') + GordoTimeseriesGenerator(test1_df, test1_df, length=10, step="60min") diff --git a/tests/gordo/machine/model/test_model.py b/tests/gordo/machine/model/test_model.py index dae3028c8..4d639e4c7 100644 --- a/tests/gordo/machine/model/test_model.py +++ b/tests/gordo/machine/model/test_model.py @@ -334,4 +334,5 @@ def test_lstmae_predict_output(): out = model.predict(xTest) assert out.shape == (2, 3) -#TODO test with GordoTimeseriesGenerator \ No newline at end of file + +# TODO test with GordoTimeseriesGenerator From 7e3adb39f75435b595b42f8d4df160802fe9a139 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 14:04:51 +0300 Subject: [PATCH 25/61] Deal with DataFrameMapper.__set_state__() and DataFrameMapper.__get_state__() --- gordo/machine/model/data_frame_mapper.py | 43 ++++++++++++++++++------ 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index dd74be58c..8db9d2d04 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -1,8 +1,12 @@ +import logging + from pydoc import locate from sklearn_pandas import DataFrameMapper, gen_features from copy import copy from typing import List, Union, Optional +logger = logging.getLogger(__name__) + class DataFrameMapper(DataFrameMapper): @@ -14,19 +18,36 @@ def __init__( classes: Optional[List[dict]] = None, **kwargs ): - if classes is not None: - classes = copy(classes) - self._prepare_classes(classes) - features = gen_features(columns=columns, classes=classes) + self.columns = columns + self.classes = classes + features = self._build_features(columns, classes) base_kwargs = copy(self._default_kwargs) base_kwargs.update(kwargs) super().__init__(features=features, **base_kwargs) @staticmethod - def _prepare_classes(classes: List[dict]): - for i, v in enumerate(classes): - if "class" not in v: - raise ValueError('"class" attribute is empty') - if isinstance(v["class"], str): - cls = locate(v["class"]) - classes[i]["class"] = cls + def _build_features( + columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, + ): + if classes is not None: + classes = copy(classes) + for i, v in enumerate(classes): + if "class" not in v: + raise ValueError('"class" attribute is empty') + if isinstance(v["class"], str): + cls = locate(v["class"]) + classes[i]["class"] = cls + logger.debug("_build_features for columns=%s, classes=%s)", columns, classes) + return gen_features(columns=columns, classes=classes) + + def __getstate__(self): + state = super().__getstate__() + state["columns"] = self.columns + state["classes"] = self.classes + del state["features"] + return state + + def __setstate__(self, state): + features = self._build_features(state.get("columns"), state.get("classes")) + state["features"] = features + super().__setstate__(state) From 38db5bdfebafdefda084f29bb51612a3d766342f Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 14:17:02 +0300 Subject: [PATCH 26/61] Some small fixes for DataFrameMapper --- gordo/machine/model/__init__.py | 1 + gordo/machine/model/data_frame_mapper.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gordo/machine/model/__init__.py b/gordo/machine/model/__init__.py index e69de29bb..fc2c16453 100644 --- a/gordo/machine/model/__init__.py +++ b/gordo/machine/model/__init__.py @@ -0,0 +1 @@ +from .data_frame_mapper import DataFrameMapper \ No newline at end of file diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index 8db9d2d04..6b9168b1e 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -2,7 +2,7 @@ from pydoc import locate from sklearn_pandas import DataFrameMapper, gen_features -from copy import copy +from copy import copy, deepcopy from typing import List, Union, Optional logger = logging.getLogger(__name__) @@ -30,7 +30,7 @@ def _build_features( columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, ): if classes is not None: - classes = copy(classes) + classes = deepcopy(classes) for i, v in enumerate(classes): if "class" not in v: raise ValueError('"class" attribute is empty') From b8e9007070ce934f6c47b089ed19c852b91bdaaa Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 15:25:46 +0300 Subject: [PATCH 27/61] black --- gordo/machine/model/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gordo/machine/model/__init__.py b/gordo/machine/model/__init__.py index fc2c16453..bb2321894 100644 --- a/gordo/machine/model/__init__.py +++ b/gordo/machine/model/__init__.py @@ -1 +1 @@ -from .data_frame_mapper import DataFrameMapper \ No newline at end of file +from .data_frame_mapper import DataFrameMapper From 3820b00efc61b5fda4c2f48484a426209cc15a1b Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 18:22:26 +0300 Subject: [PATCH 28/61] Fix tests. Add lookahead for TimeseriesGenerator --- gordo/machine/model/data_frame_mapper.py | 8 +++---- gordo/machine/model/models.py | 29 +++++++++++++++++++++--- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index 6b9168b1e..53b642c31 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -1,14 +1,14 @@ import logging +import sklearn_pandas from pydoc import locate -from sklearn_pandas import DataFrameMapper, gen_features from copy import copy, deepcopy from typing import List, Union, Optional logger = logging.getLogger(__name__) -class DataFrameMapper(DataFrameMapper): +class DataFrameMapper(sklearn_pandas.DataFrameMapper): _default_kwargs = {"df_out": True} @@ -37,8 +37,8 @@ def _build_features( if isinstance(v["class"], str): cls = locate(v["class"]) classes[i]["class"] = cls - logger.debug("_build_features for columns=%s, classes=%s)", columns, classes) - return gen_features(columns=columns, classes=classes) + logger.debug("_build_features for columns=%s, classes=%s", columns, classes) + return sklearn_pandas.gen_features(columns=columns, classes=classes) def __getstate__(self): state = super().__getstate__() diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 3f5d38625..05cf500b5 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -668,9 +668,13 @@ def create_keras_timeseriesgenerator( >>> len(gen[0][0][0][0]) # n_features = 2 2 """ - X, y = pad_x_and_y(X, y, lookahead) return timeseries_generators.create_from_config( - config, data=X, targets=y, length=lookback_window, batch_size=batch_size + config, + data=X, + targets=y, + length=lookback_window, + batch_size=batch_size, + lookahead=lookahead, ) @@ -709,7 +713,23 @@ def wrap(cls): return wrap -timeseries_generators = TimeseriesGeneratorTypes(default_type=TimeseriesGenerator) +class DefaultTimeseriesGenertor(TimeseriesGenerator): + def __init__( + self, + data: Union[pd.DataFrame, np.ndarray], + targets: Union[pd.DataFrame, np.ndarray], + lookahead: int = 1, + **kwargs, + ): + if isinstance(data, pd.DataFrame): + data = data.values + if isinstance(targets, pd.DataFrame): + targets = targets.values + data, targets = pad_x_and_y(data, targets, lookahead) + super().__init__(data=data, targets=targets, **kwargs) + + +timeseries_generators = TimeseriesGeneratorTypes(default_type=DefaultTimeseriesGenertor) @dataclass @@ -736,6 +756,7 @@ def __init__( batch_size: int = 128, shuffle: bool = False, step: Union[pd.Timedelta, str] = "10min", + lookahead: int = 1, ): if not isinstance(data, pd.DataFrame): raise ValueError("Data have to be instance of pandas.DataFrame") @@ -769,6 +790,8 @@ def __init__( "Seems like the time series are too small or in random order." "Failed chunks: %s" % self.consecutive_chunks ) + # TODO use lookahead + self.lookahead = lookahead def filter_chunks(self, indexes=None): if indexes is not None: From 19a7faefa0efeabd625a6ab5f9cd166931aaf882 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 18:23:43 +0300 Subject: [PATCH 29/61] typo --- gordo/machine/model/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 05cf500b5..d8bc9a67c 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -713,7 +713,7 @@ def wrap(cls): return wrap -class DefaultTimeseriesGenertor(TimeseriesGenerator): +class DefaultTimeseriesGenerator(TimeseriesGenerator): def __init__( self, data: Union[pd.DataFrame, np.ndarray], @@ -729,7 +729,7 @@ def __init__( super().__init__(data=data, targets=targets, **kwargs) -timeseries_generators = TimeseriesGeneratorTypes(default_type=DefaultTimeseriesGenertor) +timeseries_generators = TimeseriesGeneratorTypes(default_type=DefaultTimeseriesGenerator) @dataclass From 7ccb8e028a2d584666db15f23afa793d1b0f198b Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 18:37:38 +0300 Subject: [PATCH 30/61] black --- gordo/machine/model/models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index d8bc9a67c..b9b02daed 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -729,7 +729,9 @@ def __init__( super().__init__(data=data, targets=targets, **kwargs) -timeseries_generators = TimeseriesGeneratorTypes(default_type=DefaultTimeseriesGenerator) +timeseries_generators = TimeseriesGeneratorTypes( + default_type=DefaultTimeseriesGenerator +) @dataclass From 57a772434de70db31a6d20e6e12c2eb24564351f Mon Sep 17 00:00:00 2001 From: user Date: Wed, 15 Apr 2020 16:57:37 +0300 Subject: [PATCH 31/61] Steel not working properly --- .../__init__.py} | 25 ++++++++------- gordo/machine/model/__init__.py | 1 - gordo/machine/validators.py | 5 ++- gordo/serializer/__init__.py | 5 +++ gordo/serializer/from_definition.py | 32 +++++++++++++++++-- 5 files changed, 52 insertions(+), 16 deletions(-) rename gordo/{machine/model/data_frame_mapper.py => data_frame_mapper/__init__.py} (69%) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/data_frame_mapper/__init__.py similarity index 69% rename from gordo/machine/model/data_frame_mapper.py rename to gordo/data_frame_mapper/__init__.py index 53b642c31..73e7bcda9 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/data_frame_mapper/__init__.py @@ -9,14 +9,13 @@ class DataFrameMapper(sklearn_pandas.DataFrameMapper): - _default_kwargs = {"df_out": True} def __init__( - self, - columns: List[Union[str, List[str]]], - classes: Optional[List[dict]] = None, - **kwargs + self, + columns: List[Union[str, List[str]]], + classes: Optional[List[dict]] = None, + **kwargs ): self.columns = columns self.classes = classes @@ -27,16 +26,17 @@ def __init__( @staticmethod def _build_features( - columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, + columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, ): if classes is not None: classes = deepcopy(classes) for i, v in enumerate(classes): - if "class" not in v: - raise ValueError('"class" attribute is empty') - if isinstance(v["class"], str): - cls = locate(v["class"]) - classes[i]["class"] = cls + if isinstance(v, dict): + if "class" not in v: + raise ValueError('"class" attribute is empty') + if isinstance(v["class"], str): + cls = locate(v["class"]) + classes[i]["class"] = cls logger.debug("_build_features for columns=%s, classes=%s", columns, classes) return sklearn_pandas.gen_features(columns=columns, classes=classes) @@ -51,3 +51,6 @@ def __setstate__(self, state): features = self._build_features(state.get("columns"), state.get("classes")) state["features"] = features super().__setstate__(state) + + +__all__ = ['DataFrameMapper'] diff --git a/gordo/machine/model/__init__.py b/gordo/machine/model/__init__.py index bb2321894..e69de29bb 100644 --- a/gordo/machine/model/__init__.py +++ b/gordo/machine/model/__init__.py @@ -1 +0,0 @@ -from .data_frame_mapper import DataFrameMapper diff --git a/gordo/machine/validators.py b/gordo/machine/validators.py index f4c814730..69d0e7101 100644 --- a/gordo/machine/validators.py +++ b/gordo/machine/validators.py @@ -8,12 +8,14 @@ import dateutil.parser import logging -from gordo.serializer import from_definition +from gordo.serializer.from_definition import from_definition from gordo.machine.dataset.sensor_tag import SensorTag logger = logging.getLogger(__name__) +logger.debug("from_definition1=%s", from_definition) + class BaseDescriptor: """ @@ -85,6 +87,7 @@ class ValidModel(BaseDescriptor): def __set__(self, instance, value): if getattr(instance, "_strict", True): try: + logger.debug("from_definition=%s", from_definition) from_definition(value) except Exception as e: raise ValueError(f"Pipeline from definition failed: {e}") diff --git a/gordo/serializer/__init__.py b/gordo/serializer/__init__.py index b0af3fde2..2779d06d5 100644 --- a/gordo/serializer/__init__.py +++ b/gordo/serializer/__init__.py @@ -1,3 +1,8 @@ +import logging from .from_definition import from_definition +logger = logging.getLogger(__name__) +logger.debug("__init__=%s", from_definition) from .into_definition import into_definition from .serializer import dump, dumps, load, loads, load_metadata + +__all__=['from_definition', 'into_definition', 'dump', 'dumps', 'load', 'loads', 'load_metadata'] diff --git a/gordo/serializer/from_definition.py b/gordo/serializer/from_definition.py index 54619ff25..6ef4c2c27 100644 --- a/gordo/serializer/from_definition.py +++ b/gordo/serializer/from_definition.py @@ -4,11 +4,13 @@ import pydoc import copy import typing # noqa -from typing import Union, Dict, Any, Iterable +from typing import Union, Dict, Any, Iterable, Type, Optional from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.base import BaseEstimator from tensorflow.keras.models import Sequential +from gordo.data_frame_mapper import DataFrameMapper + logger = logging.getLogger(__name__) @@ -62,7 +64,7 @@ def from_definition( def _build_branch( definition: Iterable[Union[str, Dict[Any, Any]]], - constructor_class=Union[Pipeline, None], + constructor_class: Optional[Type[Pipeline]] = None, ): """ Builds a branch of the tree and optionally constructs the class with the given @@ -169,6 +171,11 @@ def _build_step( f"Got {StepClass} but the supplied parameters" f"seem invalid: {params}" ) + + if issubclass(StepClass, DataFrameMapper): + params = _load_data_mapper_params(params) + + logger.debug("StopClass(%s)", params) return StepClass(**params) # If step is just a string, can initialize it without any params @@ -183,6 +190,16 @@ def _build_step( ) +def _load_data_mapper_params(params: dict): + if "classes" in params: + classes = copy.deepcopy(params["classes"]) + if not isinstance(classes, list): + raise TypeError('"classes" should be a list') + logger.debug("classes=%s", classes) + params["classes"] = _build_branch(classes) + return params + + def _load_param_classes(params: dict): """ Inspect the params' values and determine if any can be loaded as a class. @@ -233,6 +250,7 @@ def _load_param_classes(params: dict): objects """ params = copy.copy(params) + logger.debug("_load_param_classes=%s", params) for key, value in params.items(): # If value is a simple string, try to load the model/class @@ -263,6 +281,14 @@ def _load_param_classes(params: dict): else: # Call this func again, incase there is nested occurances of this problem in these kwargs sub_params = value[list(value.keys())[0]] - kwargs = _load_param_classes(sub_params) + + if issubclass(Model, DataFrameMapper): + kwargs = _load_data_mapper_params(sub_params) + logger.debug( + "_load_data_mapper_params(%s)=%s", sub_params, kwargs + ) + else: + kwargs = _load_param_classes(sub_params) + params[key] = Model(**kwargs) # type: ignore return params From 9856e9c210e5135d108a81fcb0322a359b9d86f2 Mon Sep 17 00:00:00 2001 From: user Date: Wed, 15 Apr 2020 17:48:47 +0300 Subject: [PATCH 32/61] Works for simples cases --- gordo/data_frame_mapper/__init__.py | 33 ++++++++++++----------------- gordo/serializer/from_definition.py | 10 ++++----- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/gordo/data_frame_mapper/__init__.py b/gordo/data_frame_mapper/__init__.py index 73e7bcda9..c6b512840 100644 --- a/gordo/data_frame_mapper/__init__.py +++ b/gordo/data_frame_mapper/__init__.py @@ -1,9 +1,9 @@ import logging import sklearn_pandas -from pydoc import locate -from copy import copy, deepcopy -from typing import List, Union, Optional +from copy import copy +from sklearn.base import BaseEstimator +from typing import List, Union logger = logging.getLogger(__name__) @@ -14,41 +14,34 @@ class DataFrameMapper(sklearn_pandas.DataFrameMapper): def __init__( self, columns: List[Union[str, List[str]]], - classes: Optional[List[dict]] = None, + transformers: List[BaseEstimator] = None, **kwargs ): self.columns = columns - self.classes = classes - features = self._build_features(columns, classes) + self.transformers = transformers + features = self._build_features(columns, transformers) base_kwargs = copy(self._default_kwargs) base_kwargs.update(kwargs) super().__init__(features=features, **base_kwargs) @staticmethod def _build_features( - columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, + columns: List[Union[str, List[str]]], transformers: List[BaseEstimator], ): - if classes is not None: - classes = deepcopy(classes) - for i, v in enumerate(classes): - if isinstance(v, dict): - if "class" not in v: - raise ValueError('"class" attribute is empty') - if isinstance(v["class"], str): - cls = locate(v["class"]) - classes[i]["class"] = cls - logger.debug("_build_features for columns=%s, classes=%s", columns, classes) - return sklearn_pandas.gen_features(columns=columns, classes=classes) + features = [] + for column in columns: + features.append((column, transformers)) + return features def __getstate__(self): state = super().__getstate__() state["columns"] = self.columns - state["classes"] = self.classes + state["transformers"] = self.transformers del state["features"] return state def __setstate__(self, state): - features = self._build_features(state.get("columns"), state.get("classes")) + features = self._build_features(state.get("columns"), state.get("transformers")) state["features"] = features super().__setstate__(state) diff --git a/gordo/serializer/from_definition.py b/gordo/serializer/from_definition.py index 6ef4c2c27..d75d9d74b 100644 --- a/gordo/serializer/from_definition.py +++ b/gordo/serializer/from_definition.py @@ -191,12 +191,12 @@ def _build_step( def _load_data_mapper_params(params: dict): - if "classes" in params: - classes = copy.deepcopy(params["classes"]) + if "transformers" in params: + classes = copy.deepcopy(params["transformers"]) if not isinstance(classes, list): - raise TypeError('"classes" should be a list') - logger.debug("classes=%s", classes) - params["classes"] = _build_branch(classes) + raise TypeError('"transformers" should be a list') + logger.debug("transformers=%s", classes) + params["transformers"] = _build_branch(classes) return params From 0da2f890fce03c426e4c4ebccd74de222ddb6b07 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 08:53:40 +0300 Subject: [PATCH 33/61] GordoTimeseriesGenerator first version --- gordo/machine/model/models.py | 89 +++++++++++++++++++++++++---------- 1 file changed, 63 insertions(+), 26 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 117fec2c3..c316b5a4b 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -580,10 +580,8 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": """ - X = X.values if isinstance(X, pd.DataFrame) else X - y = y.values if isinstance(y, pd.DataFrame) else y - - X = self._validate_and_fix_size_of_X(X) + if not isinstance(X, pd.DataFrame): + X = self._validate_and_fix_size_of_X(X) # We call super.fit on a single sample (notice the batch_size=1) to initiate the # model using the scikit-learn wrapper. @@ -715,13 +713,34 @@ def lookahead(self) -> int: return 0 +def pad_x_and_y( + X: np.ndarray, y: np.ndarray, lookahead: int +) -> Tuple[np.ndarray, np.ndarray]: + new_length = len(X) + 1 - lookahead + if lookahead == 1: + return X, y + elif lookahead >= 0: + pad_kw = dict(maxlen=new_length, dtype=X.dtype) + + if lookahead == 0: + X = pad_sequences([X], padding="post", **pad_kw)[0] + y = pad_sequences([y], padding="pre", **pad_kw)[0] + + elif lookahead > 1: + X = pad_sequences([X], padding="post", truncating="post", **pad_kw)[0] + y = pad_sequences([y], padding="pre", truncating="pre", **pad_kw)[0] + return X, y + else: + raise ValueError(f"Value of `lookahead` can not be negative, is {lookahead}") + + def create_keras_timeseriesgenerator( - X: np.ndarray, - y: Optional[np.ndarray], + X: Union[pd.DataFrame, np.ndarray], + y: Optional[Union[pd.DataFrame, np.ndarray]], batch_size: int, lookback_window: int, lookahead: int, -) -> tensorflow.keras.preprocessing.sequence.TimeseriesGenerator: +) -> object: """ Provides a `keras.preprocessing.sequence.TimeseriesGenerator` for use with LSTM's, but with the added ability to specify the lookahead of the target in y. @@ -773,27 +792,45 @@ def create_keras_timeseriesgenerator( >>> len(gen[0][0][0][0]) # n_features = 2 2 """ - new_length = len(X) + 1 - lookahead - kwargs: Dict[str, Any] = dict(length=lookback_window, batch_size=batch_size) - if lookahead == 1: - kwargs.update(dict(data=X, targets=y)) - elif lookahead >= 0: + if isinstance(X, pd.DataFrame): + if not isinstance(y, pd.DataFrame): + raise ValueError("'y' should be instance of pandas.DataFrame") + return GordoTimeseriesGenerator( + data=X, targets=y, length=lookback_window, batch_size=batch_size + ) + else: + X, y = pad_x_and_y(X, y, lookahead) + return TimeseriesGenerator( + data=X, targets=y, length=lookback_window, batch_size=batch_size + ) - pad_kw = dict(maxlen=new_length, dtype=X.dtype) - if lookahead == 0: - kwargs["data"] = pad_sequences([X], padding="post", **pad_kw)[0] - kwargs["targets"] = pad_sequences([y], padding="pre", **pad_kw)[0] +class GordoTimeseriesGenerator(object): + def __init__( + self, data: pd.DataFrame, targets: pd.DataFrame, length: int, batch_size=128 + ): - elif lookahead > 1: - kwargs["data"] = pad_sequences( - [X], padding="post", truncating="post", **pad_kw - )[0] - kwargs["targets"] = pad_sequences( - [y], padding="pre", truncating="pre", **pad_kw - )[0] - else: - raise ValueError(f"Value of `lookahead` can not be negative, is {lookahead}") + if len(data) != len(targets): + raise ValueError( + "Data and targets have to be" + " of same length. " + "Data length is {}".format(len(data)) + + " while target length is {}".format(len(targets)) + ) + + self.data = data + self.targets = targets + self.length = length + self.batch_size = batch_size + + def __len__(self): + return (len(self.data) - 1 + self.batch_size) // self.batch_size + + def __getitem__(self, index): + i = self.batch_size * index + rows = np.arange(i, min(i + self.batch_size, len(self.data)), 1) + + samples = np.array([self.data[row - self.length : row : 1] for row in rows]) + targets = np.array([self.targets[row] for row in rows]) - return TimeseriesGenerator(**kwargs) + return samples, targets From 2d0ba043badc623a28072d1d01b9f1754605cbdc Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 11:01:50 +0300 Subject: [PATCH 34/61] First approach for GordoTimeseriesGenerator --- gordo/machine/model/models.py | 59 ++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index c316b5a4b..9705d9cfa 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -795,7 +795,7 @@ def create_keras_timeseriesgenerator( if isinstance(X, pd.DataFrame): if not isinstance(y, pd.DataFrame): - raise ValueError("'y' should be instance of pandas.DataFrame") + raise ValueError("'y' should be an instance of pandas.DataFrame") return GordoTimeseriesGenerator( data=X, targets=y, length=lookback_window, batch_size=batch_size ) @@ -808,7 +808,12 @@ def create_keras_timeseriesgenerator( class GordoTimeseriesGenerator(object): def __init__( - self, data: pd.DataFrame, targets: pd.DataFrame, length: int, batch_size=128 + self, + data: pd.DataFrame, + targets: pd.DataFrame, + length: int, + batch_size=128, + step: Optional[pd.Timedelta] = None, ): if len(data) != len(targets): @@ -822,15 +827,55 @@ def __init__( self.targets = targets self.length = length self.batch_size = batch_size + if step is None: + step = pd.Timedelta(minutes=10) + self.step = step + self.time_batch_size = step * batch_size def __len__(self): return (len(self.data) - 1 + self.batch_size) // self.batch_size + def split_consecutive( + self, df: pd.DataFrame + ) -> Tuple[pd.DataFrame, Optional[pd.Timestamp]]: + prev_date = None + start_date = None + for dt in df.index: + if prev_date is None: + prev_date = dt + start_date = dt + else: + if dt - prev_date != self.step: + return df.loc[start_date:prev_date], dt + prev_date = dt + return df, None + def __getitem__(self, index): i = self.batch_size * index - rows = np.arange(i, min(i + self.batch_size, len(self.data)), 1) - samples = np.array([self.data[row - self.length : row : 1] for row in rows]) - targets = np.array([self.targets[row] for row in rows]) - - return samples, targets + index = self.data.index + + samples = [] + current_date = index.min() + while True: + batch = self.data[current_date : current_date + self.time_batch_size] + if batch.empty: + break + if len(batch) == self.batch_size: + samples.append(batch.values) + current_date += self.step + else: + batch, last_date = self.split_consecutive(batch) + batch_values = batch.values + if last_date is not None: + current_date = last_date + batch_values = pad_sequences( + [batch_values], padding="post", truncating="post", maxlen=batch + )[0] + else: + current_date += self.step + samples.append(batch_values) + + targets = np.array([self.targets[row] for row in range(len(samples))]) + + return np.array(samples), np.array(targets) From f12a99935bff35619e5e4f1ac8b48d82bf44a48e Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 11:14:59 +0300 Subject: [PATCH 35/61] Always pass pd.DataFrame for create_keras_timeseriesgenerator() --- gordo/machine/model/models.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 9705d9cfa..f6cb29637 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -582,6 +582,8 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": if not isinstance(X, pd.DataFrame): X = self._validate_and_fix_size_of_X(X) + else: + pass #TODO # We call super.fit on a single sample (notice the batch_size=1) to initiate the # model using the scikit-learn wrapper. @@ -653,9 +655,11 @@ def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: >>> model_transform.shape (2, 2) """ - X = X.values if isinstance(X, pd.DataFrame) else X + if not isinstance(X, pd.DataFrame): + X = self._validate_and_fix_size_of_X(X) + else: + pass #TODO - X = self._validate_and_fix_size_of_X(X) tsg = create_keras_timeseriesgenerator( X=X, y=X, @@ -796,6 +800,7 @@ def create_keras_timeseriesgenerator( if isinstance(X, pd.DataFrame): if not isinstance(y, pd.DataFrame): raise ValueError("'y' should be an instance of pandas.DataFrame") + #TODO padding for X and y return GordoTimeseriesGenerator( data=X, targets=y, length=lookback_window, batch_size=batch_size ) From 2ea84c5c60bdb99481dabf2da99b340ca984be13 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 11:35:42 +0300 Subject: [PATCH 36/61] Fix targets in GordoTimeseriesGenerator --- gordo/machine/model/models.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index f6cb29637..6df88b034 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -583,7 +583,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": if not isinstance(X, pd.DataFrame): X = self._validate_and_fix_size_of_X(X) else: - pass #TODO + pass # TODO # We call super.fit on a single sample (notice the batch_size=1) to initiate the # model using the scikit-learn wrapper. @@ -658,7 +658,7 @@ def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: if not isinstance(X, pd.DataFrame): X = self._validate_and_fix_size_of_X(X) else: - pass #TODO + pass # TODO tsg = create_keras_timeseriesgenerator( X=X, @@ -800,7 +800,7 @@ def create_keras_timeseriesgenerator( if isinstance(X, pd.DataFrame): if not isinstance(y, pd.DataFrame): raise ValueError("'y' should be an instance of pandas.DataFrame") - #TODO padding for X and y + # TODO padding for X and y return GordoTimeseriesGenerator( data=X, targets=y, length=lookback_window, batch_size=batch_size ) @@ -817,7 +817,7 @@ def __init__( data: pd.DataFrame, targets: pd.DataFrame, length: int, - batch_size=128, + batch_size: int = 128, step: Optional[pd.Timedelta] = None, ): @@ -856,16 +856,17 @@ def split_consecutive( return df, None def __getitem__(self, index): - i = self.batch_size * index - - index = self.data.index + data = self.data + index = data.index samples = [] + rows = [] current_date = index.min() while True: - batch = self.data[current_date : current_date + self.time_batch_size] + batch = data.loc[current_date : current_date + self.time_batch_size] if batch.empty: break + rows.append(index.get_loc(current_date)) if len(batch) == self.batch_size: samples.append(batch.values) current_date += self.step @@ -881,6 +882,6 @@ def __getitem__(self, index): current_date += self.step samples.append(batch_values) - targets = np.array([self.targets[row] for row in range(len(samples))]) + targets = np.array([self.targets[row] for row in rows]) return np.array(samples), np.array(targets) From beeff760fab2a34dd9500d8e0c613425378511e4 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 2 Apr 2020 12:06:53 +0300 Subject: [PATCH 37/61] Fix github checks --- gordo/machine/model/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 6df88b034..d8a3f71ee 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -744,7 +744,7 @@ def create_keras_timeseriesgenerator( batch_size: int, lookback_window: int, lookahead: int, -) -> object: +) -> TimeseriesGenerator: """ Provides a `keras.preprocessing.sequence.TimeseriesGenerator` for use with LSTM's, but with the added ability to specify the lookahead of the target in y. From 235c0bfc33f8b09e63a9e28dd8db1bc519d7e94a Mon Sep 17 00:00:00 2001 From: Serhii Date: Tue, 7 Apr 2020 14:25:10 +0300 Subject: [PATCH 38/61] Do GordoTimeseriesGenerator in different way --- gordo/machine/model/models.py | 137 +++++++++++++++++++++------------- 1 file changed, 86 insertions(+), 51 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index d8a3f71ee..b3e26d917 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -9,12 +9,14 @@ from abc import ABCMeta from copy import copy, deepcopy from importlib.util import find_spec +from dataclasses import dataclass import h5py import tensorflow.keras.models from tensorflow.keras.models import load_model, save_model from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator from tensorflow.keras.wrappers.scikit_learn import KerasRegressor as BaseWrapper +from tensorflow.python.keras.utils import data_utils import numpy as np import pandas as pd import xarray as xr @@ -811,77 +813,110 @@ def create_keras_timeseriesgenerator( ) -class GordoTimeseriesGenerator(object): +@dataclass +class TimeseriesChunk: + start_ts: pd.Timestamp + end_ts: pd.Timestamp + size: int + + +@dataclass +class TimeseriesGeneratorContainer: + generator: TimeseriesGenerator + chunk: TimeseriesChunk + length: int + + +class GordoTimeseriesGenerator(data_utils.Sequence): def __init__( self, data: pd.DataFrame, targets: pd.DataFrame, length: int, batch_size: int = 128, - step: Optional[pd.Timedelta] = None, + shuffle: bool = False, + step: Optional[Union[pd.Timedelta, int]] = None, ): if len(data) != len(targets): raise ValueError( - "Data and targets have to be" + " of same length. " - "Data length is {}".format(len(data)) - + " while target length is {}".format(len(targets)) + "Data and targets have to be of same length. " + f"Data length is {len(data)}" + f" while target length is {len(targets)}" ) - self.data = data - self.targets = targets - self.length = length - self.batch_size = batch_size if step is None: step = pd.Timedelta(minutes=10) + if isinstance(step, int): + step = pd.Timedelta(minutes=step) self.step = step - self.time_batch_size = step * batch_size + self.consecutive_chunks = self.find_consecutive_chunks(data) + self.failed_chunks = [] + self.generators_containers = self.create_generator_containers( + data, targets, length=length, batch_size=batch_size, shuffle=shuffle + ) + + def filter_chunks(self, indexes=None): + if indexes is not None: + self.generators_containers = [ + self.generators_containers[i] for i in indexes + ] def __len__(self): - return (len(self.data) - 1 + self.batch_size) // self.batch_size + return sum(container.length for container in self.generators_containers) - def split_consecutive( - self, df: pd.DataFrame - ) -> Tuple[pd.DataFrame, Optional[pd.Timestamp]]: - prev_date = None - start_date = None + def find_consecutive_chunks(self, df: pd.DataFrame) -> List[TimeseriesChunk]: + chunks = [] + prev_ts, start_ts, size = None, None, 0 for dt in df.index: - if prev_date is None: - prev_date = dt - start_date = dt + if prev_ts is None: + prev_ts = dt + start_ts = dt else: - if dt - prev_date != self.step: - return df.loc[start_date:prev_date], dt - prev_date = dt - return df, None - - def __getitem__(self, index): - data = self.data - index = data.index - - samples = [] - rows = [] - current_date = index.min() - while True: - batch = data.loc[current_date : current_date + self.time_batch_size] - if batch.empty: - break - rows.append(index.get_loc(current_date)) - if len(batch) == self.batch_size: - samples.append(batch.values) - current_date += self.step - else: - batch, last_date = self.split_consecutive(batch) - batch_values = batch.values - if last_date is not None: - current_date = last_date - batch_values = pad_sequences( - [batch_values], padding="post", truncating="post", maxlen=batch - )[0] + if dt - prev_ts == self.step: + size += 1 + prev_ts = dt else: - current_date += self.step - samples.append(batch_values) + chunks.append(TimeseriesChunk(start_ts, prev_ts, size)) + prev_ts, start_ts, size = None, None, 0 + if start_ts is not None: + chunks.append(TimeseriesChunk(start_ts, prev_ts, size)) + return chunks - targets = np.array([self.targets[row] for row in rows]) + def create_generator_containers( + self, + data: pd.DataFrame, + targets: pd.DataFrame, + length: int, + batch_size: int, + shuffle: bool, + ) -> List[TimeseriesGeneratorContainer]: + generator_containers = [] + for chunk in self.consecutive_chunks: + gen_data = data[chunk.start_ts : chunk.end_ts].values + gen_target = targets[chunk.start_ts : chunk.end_ts].values + try: + generator = TimeseriesGenerator( + gen_data, + gen_target, + length=length, + batch_size=batch_size, + shuffle=shuffle, + ) + except ValueError: + self.failed_chunks.append(chunk) + length = len(generator) + generator_containers.append( + TimeseriesGeneratorContainer(generator, chunk, length) + ) + return generator_containers - return np.array(samples), np.array(targets) + def __getitem__(self, index): + i = -1 + for container in self.generators_containers: + new_i = i + container.length + if index <= new_i: + gen_i = index - i - 1 + return container.generator[gen_i] + i = new_i + raise IndexError(index) From 2344f3885e474138c9369d90f8dd04417e968c9f Mon Sep 17 00:00:00 2001 From: Serhii Date: Wed, 8 Apr 2020 09:36:11 +0300 Subject: [PATCH 39/61] Fix annotation checks on Github --- gordo/machine/model/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index b3e26d917..00ef39218 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -851,7 +851,7 @@ def __init__( step = pd.Timedelta(minutes=step) self.step = step self.consecutive_chunks = self.find_consecutive_chunks(data) - self.failed_chunks = [] + self.failed_chunks: List[TimeseriesChunk] = [] self.generators_containers = self.create_generator_containers( data, targets, length=length, batch_size=batch_size, shuffle=shuffle ) From 663df7437c41f7a9f9e931603588937f45d3b767 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 09:19:33 +0300 Subject: [PATCH 40/61] logger.debug() in GordoTimeseriesGenerator --- gordo/machine/model/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 00ef39218..475e168f9 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -851,10 +851,12 @@ def __init__( step = pd.Timedelta(minutes=step) self.step = step self.consecutive_chunks = self.find_consecutive_chunks(data) + logger.debug('GordoTimeseriesGenerator with consecutive_chunks=%s', self.consecutive_chunks) self.failed_chunks: List[TimeseriesChunk] = [] self.generators_containers = self.create_generator_containers( data, targets, length=length, batch_size=batch_size, shuffle=shuffle ) + logger.debug('GordoTimeseriesGenerator with generators_containers=%s', self.generators_containers) def filter_chunks(self, indexes=None): if indexes is not None: From 7dc56a6d11c05033b471beeee92ded85f4cee4b2 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 09:32:58 +0300 Subject: [PATCH 41/61] raise ValueError if the time series in wrong shape --- gordo/machine/model/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 475e168f9..5f7d24d52 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -857,6 +857,8 @@ def __init__( data, targets, length=length, batch_size=batch_size, shuffle=shuffle ) logger.debug('GordoTimeseriesGenerator with generators_containers=%s', self.generators_containers) + if not self.generators_containers: + raise ValueError("Seems like the time series are too small or in random order") def filter_chunks(self, indexes=None): if indexes is not None: From a0dd2e99a01d3cd08e13c13b61924f60a8ccc467 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:01:07 +0300 Subject: [PATCH 42/61] test_find_consecutive_chunks() --- gordo/machine/model/models.py | 24 ++++----- .../model/test_gordo_timeseries_generator.py | 49 +++++++++++++++++++ 2 files changed, 62 insertions(+), 11 deletions(-) create mode 100644 tests/gordo/machine/model/test_gordo_timeseries_generator.py diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 5f7d24d52..71c87576b 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -858,7 +858,8 @@ def __init__( ) logger.debug('GordoTimeseriesGenerator with generators_containers=%s', self.generators_containers) if not self.generators_containers: - raise ValueError("Seems like the time series are too small or in random order") + raise ValueError("Seems like the time series are too small or in random order." + "Failed chunks: %s" % self.consecutive_chunks) def filter_chunks(self, indexes=None): if indexes is not None: @@ -871,20 +872,20 @@ def __len__(self): def find_consecutive_chunks(self, df: pd.DataFrame) -> List[TimeseriesChunk]: chunks = [] - prev_ts, start_ts, size = None, None, 0 - for dt in df.index: + prev_ts, start_ts, start_i = None, None, 0 + for i, dt in enumerate(df.index): if prev_ts is None: prev_ts = dt start_ts = dt else: if dt - prev_ts == self.step: - size += 1 prev_ts = dt else: - chunks.append(TimeseriesChunk(start_ts, prev_ts, size)) - prev_ts, start_ts, size = None, None, 0 + chunks.append(TimeseriesChunk(start_ts, prev_ts, i - start_i)) + prev_ts, start_ts = None, None + start_i = i if start_ts is not None: - chunks.append(TimeseriesChunk(start_ts, prev_ts, size)) + chunks.append(TimeseriesChunk(start_ts, prev_ts, len(df.index) - start_i)) return chunks def create_generator_containers( @@ -909,10 +910,11 @@ def create_generator_containers( ) except ValueError: self.failed_chunks.append(chunk) - length = len(generator) - generator_containers.append( - TimeseriesGeneratorContainer(generator, chunk, length) - ) + else: + length = len(generator) + generator_containers.append( + TimeseriesGeneratorContainer(generator, chunk, length) + ) return generator_containers def __getitem__(self, index): diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py new file mode 100644 index 000000000..75c572e84 --- /dev/null +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -0,0 +1,49 @@ +import pandas as pd +from itertools import chain +from random import randrange + +from gordo.machine.model.models import GordoTimeseriesGenerator, TimeseriesChunk + +def get_test_datetimeindex(time_intervals, freq=None): + if freq is None: + freq = 'H' + dti_iters = (pd.date_range(d, periods=p, freq=freq) for d, p in time_intervals) + return pd.DatetimeIndex(list(chain(*dti_iters))) + +def random_gen(min_value=80, max_value=100): + def generate(values_count): + for v in range(values_count): + yield randrange(min_value, max_value) + return generate + +def get_test_df(time_intervals, generator=None, freq=None, tags_count=3): + if generator is None: + generator = random_gen() + dti = get_test_datetimeindex(time_intervals, freq) + tag_names = ['tag%d' % v for v in range(tags_count)] + data = {k: [] for k in tag_names} + generate_count=len(dti) + for _ in range(generate_count): + for tag_name, value in zip(tag_names, generator(tags_count)): + data[tag_name].append(value) + return pd.DataFrame(data, index=dti).sort_index() + +def test_find_consecutive_chunks(): + test1_time_intervals = ( + ('2018-01-01', 8), + ('2018-01-02', 45), + ('2018-01-04', 10), + ('2018-01-05', 30), + ('2018-02-03', 20), + ) + test1_df = get_test_df(test1_time_intervals) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) + expected_chunks = [TimeseriesChunk(start_ts=pd.Timestamp('2018-01-01 00:00:00'), end_ts=pd.Timestamp('2018-01-01 07:00:00'), size=8), + TimeseriesChunk(start_ts=pd.Timestamp('2018-01-02 01:00:00'), end_ts=pd.Timestamp('2018-01-03 20:00:00'), size=45), + TimeseriesChunk(start_ts=pd.Timestamp('2018-01-04 01:00:00'), end_ts=pd.Timestamp('2018-01-04 09:00:00'), size=10), + TimeseriesChunk(start_ts=pd.Timestamp('2018-01-05 01:00:00'), end_ts=pd.Timestamp('2018-01-06 05:00:00'), size=30), + TimeseriesChunk(start_ts=pd.Timestamp('2018-02-03 01:00:00'), end_ts=pd.Timestamp('2018-02-03 19:00:00'), size=20)] + assert len(gen.consecutive_chunks) == len(expected_chunks) + for chunk, expected_chunk in zip(gen.consecutive_chunks, expected_chunks): + assert chunk == expected_chunk + From a043bb35aeebd7ad0091861da53912ad33997e84 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:16:55 +0300 Subject: [PATCH 43/61] test_create_generator_containers() --- .../model/test_gordo_timeseries_generator.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 75c572e84..70aae1ac2 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -47,3 +47,23 @@ def test_find_consecutive_chunks(): for chunk, expected_chunk in zip(gen.consecutive_chunks, expected_chunks): assert chunk == expected_chunk +def test_create_generator_containers(): + test1_time_intervals = ( + ('2018-01-01', 4), + ('2018-01-02', 35), + ('2018-01-04', 10), + ) + test1_df = get_test_df(test1_time_intervals) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) + expected_generator_containers = [ + {'chunk': TimeseriesChunk(start_ts=pd.Timestamp('2018-01-02 01:00:00'), end_ts=pd.Timestamp('2018-01-03 10:00:00'), size=35), 'length': 1}, + {'chunk': TimeseriesChunk(start_ts=pd.Timestamp('2018-01-04 01:00:00'), end_ts=pd.Timestamp('2018-01-04 09:00:00'), size=10), 'length': 1}, + ] + assert len(gen.generators_containers) == 2 + for i, generator_container in enumerate(gen.generators_containers): + for k, v in expected_generator_containers[i].items(): + assert getattr(generator_container, k) == v, "%s.%s != %s" % (generator_container, k, v) + expected_failed_chunk = TimeseriesChunk(start_ts=pd.Timestamp('2018-01-01 00:00:00'), end_ts=pd.Timestamp('2018-01-01 03:00:00'), size=4) + assert len(gen.failed_chunks) == 1 + assert gen.failed_chunks[0] == expected_failed_chunk + From 36a429bbf894fa7f9a06368806eb6e55fe0b83f9 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:51:26 +0300 Subject: [PATCH 44/61] test_timeseries_generator() --- .../model/test_gordo_timeseries_generator.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 70aae1ac2..994bca997 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -1,6 +1,10 @@ +import pytest + import pandas as pd from itertools import chain from random import randrange +from itertools import count +from numpy import ndarray from gordo.machine.model.models import GordoTimeseriesGenerator, TimeseriesChunk @@ -16,6 +20,14 @@ def generate(values_count): yield randrange(min_value, max_value) return generate +def range_gen(): + g=count() + def generate(values_count): + ret_value = next(g) + for v in range(values_count): + yield ret_value + return generate + def get_test_df(time_intervals, generator=None, freq=None, tags_count=3): if generator is None: generator = random_gen() @@ -67,3 +79,45 @@ def test_create_generator_containers(): assert len(gen.failed_chunks) == 1 assert gen.failed_chunks[0] == expected_failed_chunk +def test_timeseries_generator(): + test1_time_intervals = ( + ('2018-01-02', 15), + ('2018-01-04', 10), + ) + test1_df = get_test_df(test1_time_intervals, generator=range_gen(), tags_count=1) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step=60) + assert len(gen.generators_containers) == 2 + assert len(gen) == 6 + x, y = gen[0] + expect_x=[[[0], + [1], + [2], + [3], + [4]], + [[1], + [2], + [3], + [4], + [5]], + + [[2], + [3], + [4], + [5], + [6]]] + expect_y=[[5], + [6], + [7]] + assert x.tolist() == expect_x + assert y.tolist() == expect_y + +def test_too_short_timeseries_length(): + test1_time_intervals = ( + ('2018-01-01', 4), + ('2018-01-02', 6), + ('2018-01-04', 8), + ) + test1_df = get_test_df(test1_time_intervals) + with pytest.raises(ValueError): + GordoTimeseriesGenerator(test1_df, test1_df, length=10, step=60) + From a08a44ab602e5771f70d7ac63ffd3e9ed3895405 Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:52:07 +0300 Subject: [PATCH 45/61] black --- gordo/machine/model/models.py | 16 ++- .../model/test_gordo_timeseries_generator.py | 133 ++++++++++++------ 2 files changed, 99 insertions(+), 50 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 71c87576b..b741dffd6 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -851,15 +851,23 @@ def __init__( step = pd.Timedelta(minutes=step) self.step = step self.consecutive_chunks = self.find_consecutive_chunks(data) - logger.debug('GordoTimeseriesGenerator with consecutive_chunks=%s', self.consecutive_chunks) + logger.debug( + "GordoTimeseriesGenerator with consecutive_chunks=%s", + self.consecutive_chunks, + ) self.failed_chunks: List[TimeseriesChunk] = [] self.generators_containers = self.create_generator_containers( data, targets, length=length, batch_size=batch_size, shuffle=shuffle ) - logger.debug('GordoTimeseriesGenerator with generators_containers=%s', self.generators_containers) + logger.debug( + "GordoTimeseriesGenerator with generators_containers=%s", + self.generators_containers, + ) if not self.generators_containers: - raise ValueError("Seems like the time series are too small or in random order." - "Failed chunks: %s" % self.consecutive_chunks) + raise ValueError( + "Seems like the time series are too small or in random order." + "Failed chunks: %s" % self.consecutive_chunks + ) def filter_chunks(self, indexes=None): if indexes is not None: diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 994bca997..4962594ba 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -8,116 +8,157 @@ from gordo.machine.model.models import GordoTimeseriesGenerator, TimeseriesChunk + def get_test_datetimeindex(time_intervals, freq=None): if freq is None: - freq = 'H' + freq = "H" dti_iters = (pd.date_range(d, periods=p, freq=freq) for d, p in time_intervals) return pd.DatetimeIndex(list(chain(*dti_iters))) + def random_gen(min_value=80, max_value=100): def generate(values_count): for v in range(values_count): yield randrange(min_value, max_value) + return generate + def range_gen(): - g=count() + g = count() + def generate(values_count): ret_value = next(g) for v in range(values_count): yield ret_value + return generate + def get_test_df(time_intervals, generator=None, freq=None, tags_count=3): if generator is None: generator = random_gen() dti = get_test_datetimeindex(time_intervals, freq) - tag_names = ['tag%d' % v for v in range(tags_count)] + tag_names = ["tag%d" % v for v in range(tags_count)] data = {k: [] for k in tag_names} - generate_count=len(dti) + generate_count = len(dti) for _ in range(generate_count): for tag_name, value in zip(tag_names, generator(tags_count)): data[tag_name].append(value) return pd.DataFrame(data, index=dti).sort_index() + def test_find_consecutive_chunks(): test1_time_intervals = ( - ('2018-01-01', 8), - ('2018-01-02', 45), - ('2018-01-04', 10), - ('2018-01-05', 30), - ('2018-02-03', 20), + ("2018-01-01", 8), + ("2018-01-02", 45), + ("2018-01-04", 10), + ("2018-01-05", 30), + ("2018-02-03", 20), ) test1_df = get_test_df(test1_time_intervals) gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) - expected_chunks = [TimeseriesChunk(start_ts=pd.Timestamp('2018-01-01 00:00:00'), end_ts=pd.Timestamp('2018-01-01 07:00:00'), size=8), - TimeseriesChunk(start_ts=pd.Timestamp('2018-01-02 01:00:00'), end_ts=pd.Timestamp('2018-01-03 20:00:00'), size=45), - TimeseriesChunk(start_ts=pd.Timestamp('2018-01-04 01:00:00'), end_ts=pd.Timestamp('2018-01-04 09:00:00'), size=10), - TimeseriesChunk(start_ts=pd.Timestamp('2018-01-05 01:00:00'), end_ts=pd.Timestamp('2018-01-06 05:00:00'), size=30), - TimeseriesChunk(start_ts=pd.Timestamp('2018-02-03 01:00:00'), end_ts=pd.Timestamp('2018-02-03 19:00:00'), size=20)] + expected_chunks = [ + TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-01 00:00:00"), + end_ts=pd.Timestamp("2018-01-01 07:00:00"), + size=8, + ), + TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-02 01:00:00"), + end_ts=pd.Timestamp("2018-01-03 20:00:00"), + size=45, + ), + TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-04 01:00:00"), + end_ts=pd.Timestamp("2018-01-04 09:00:00"), + size=10, + ), + TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-05 01:00:00"), + end_ts=pd.Timestamp("2018-01-06 05:00:00"), + size=30, + ), + TimeseriesChunk( + start_ts=pd.Timestamp("2018-02-03 01:00:00"), + end_ts=pd.Timestamp("2018-02-03 19:00:00"), + size=20, + ), + ] assert len(gen.consecutive_chunks) == len(expected_chunks) for chunk, expected_chunk in zip(gen.consecutive_chunks, expected_chunks): assert chunk == expected_chunk + def test_create_generator_containers(): test1_time_intervals = ( - ('2018-01-01', 4), - ('2018-01-02', 35), - ('2018-01-04', 10), + ("2018-01-01", 4), + ("2018-01-02", 35), + ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals) gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) expected_generator_containers = [ - {'chunk': TimeseriesChunk(start_ts=pd.Timestamp('2018-01-02 01:00:00'), end_ts=pd.Timestamp('2018-01-03 10:00:00'), size=35), 'length': 1}, - {'chunk': TimeseriesChunk(start_ts=pd.Timestamp('2018-01-04 01:00:00'), end_ts=pd.Timestamp('2018-01-04 09:00:00'), size=10), 'length': 1}, + { + "chunk": TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-02 01:00:00"), + end_ts=pd.Timestamp("2018-01-03 10:00:00"), + size=35, + ), + "length": 1, + }, + { + "chunk": TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-04 01:00:00"), + end_ts=pd.Timestamp("2018-01-04 09:00:00"), + size=10, + ), + "length": 1, + }, ] assert len(gen.generators_containers) == 2 for i, generator_container in enumerate(gen.generators_containers): for k, v in expected_generator_containers[i].items(): - assert getattr(generator_container, k) == v, "%s.%s != %s" % (generator_container, k, v) - expected_failed_chunk = TimeseriesChunk(start_ts=pd.Timestamp('2018-01-01 00:00:00'), end_ts=pd.Timestamp('2018-01-01 03:00:00'), size=4) + assert getattr(generator_container, k) == v, "%s.%s != %s" % ( + generator_container, + k, + v, + ) + expected_failed_chunk = TimeseriesChunk( + start_ts=pd.Timestamp("2018-01-01 00:00:00"), + end_ts=pd.Timestamp("2018-01-01 03:00:00"), + size=4, + ) assert len(gen.failed_chunks) == 1 assert gen.failed_chunks[0] == expected_failed_chunk + def test_timeseries_generator(): test1_time_intervals = ( - ('2018-01-02', 15), - ('2018-01-04', 10), + ("2018-01-02", 15), + ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals, generator=range_gen(), tags_count=1) gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step=60) assert len(gen.generators_containers) == 2 assert len(gen) == 6 x, y = gen[0] - expect_x=[[[0], - [1], - [2], - [3], - [4]], - [[1], - [2], - [3], - [4], - [5]], - - [[2], - [3], - [4], - [5], - [6]]] - expect_y=[[5], - [6], - [7]] + expect_x = [ + [[0], [1], [2], [3], [4]], + [[1], [2], [3], [4], [5]], + [[2], [3], [4], [5], [6]], + ] + expect_y = [[5], [6], [7]] assert x.tolist() == expect_x assert y.tolist() == expect_y + def test_too_short_timeseries_length(): test1_time_intervals = ( - ('2018-01-01', 4), - ('2018-01-02', 6), - ('2018-01-04', 8), + ("2018-01-01", 4), + ("2018-01-02", 6), + ("2018-01-04", 8), ) test1_df = get_test_df(test1_time_intervals) with pytest.raises(ValueError): GordoTimeseriesGenerator(test1_df, test1_df, length=10, step=60) - From 7e68135c3bbb56bfe5de1ba7f324e136a6abb5fd Mon Sep 17 00:00:00 2001 From: Serhii Date: Thu, 9 Apr 2020 15:52:27 +0300 Subject: [PATCH 46/61] Remove unused import --- tests/gordo/machine/model/test_gordo_timeseries_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 4962594ba..660f1a7c4 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -4,7 +4,6 @@ from itertools import chain from random import randrange from itertools import count -from numpy import ndarray from gordo.machine.model.models import GordoTimeseriesGenerator, TimeseriesChunk From f3add874ed772a00d0b425abbfae05f4b61be661 Mon Sep 17 00:00:00 2001 From: Serhii Date: Fri, 10 Apr 2020 14:25:52 +0300 Subject: [PATCH 47/61] timeseries_generators() --- gordo/machine/model/models.py | 69 +++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index b741dffd6..39448ef97 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -10,6 +10,7 @@ from copy import copy, deepcopy from importlib.util import find_spec from dataclasses import dataclass +from copy import copy import h5py import tensorflow.keras.models @@ -474,6 +475,7 @@ def __init__( kind: Union[Callable, str], lookback_window: int = 1, batch_size: int = 32, + timeseries_generator: Optional[Dict[str, Any]] = None, **kwargs, ) -> None: """ @@ -506,6 +508,8 @@ def __init__( kwargs["kind"] = kind kwargs["batch_size"] = batch_size + self.timeseries_generator_config = timeseries_generator + # fit_generator_params is a set of strings with the keyword arguments of # Keras fit_generator method (excluding "shuffle" as this will be hardcoded). # This will be used in the fit method of the respective subclasses to match @@ -597,6 +601,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": batch_size=1, lookback_window=self.lookback_window, lookahead=self.lookahead, + config=self.timeseries_generator_config, ) primer_x, primer_y = tsg[0] @@ -609,6 +614,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": batch_size=self.batch_size, lookback_window=self.lookback_window, lookahead=self.lookahead, + config=self.timeseries_generator_config, ) gen_kwargs = { @@ -746,6 +752,7 @@ def create_keras_timeseriesgenerator( batch_size: int, lookback_window: int, lookahead: int, + config: Optional[Dict[str, Any]] = None, ) -> TimeseriesGenerator: """ Provides a `keras.preprocessing.sequence.TimeseriesGenerator` for use with @@ -798,19 +805,47 @@ def create_keras_timeseriesgenerator( >>> len(gen[0][0][0][0]) # n_features = 2 2 """ + X, y = pad_x_and_y(X, y, lookahead) + return timeseries_generators.create_from_config( + config, data=X, targets=y, length=lookback_window, batch_size=batch_size + ) - if isinstance(X, pd.DataFrame): - if not isinstance(y, pd.DataFrame): - raise ValueError("'y' should be an instance of pandas.DataFrame") - # TODO padding for X and y - return GordoTimeseriesGenerator( - data=X, targets=y, length=lookback_window, batch_size=batch_size - ) - else: - X, y = pad_x_and_y(X, y, lookahead) - return TimeseriesGenerator( - data=X, targets=y, length=lookback_window, batch_size=batch_size - ) + +class TimeseriesGeneratorTypes: + def __init__(self, default_type): + self.default_type = default_type + self._types = {} + + def create_from_config(self, config, **kwargs): + if config is None: + return self.default_type(**kwargs) + else: + if "type" not in config: + raise ValueError( + 'Unspecified "type" attribute for "timeseries_generator"' + ) + type_name = config["type"] + if type_name not in self._types: + raise ValueError( + f'Unknown type "{type_name}" for "timeseries_generator"' + ) + all_kwargs = copy(config).pop("type") + all_kwargs.update(kwargs) + return self._types[type_name](**all_kwargs) + + def __call__(self, type_name): + def wrap(cls): + if type_name in self._types: + raise ValueError( + f'TimeseriesGenerator type with name "{type_name}" already exists' + ) + self._types[type_name] = cls + return cls + + return wrap + + +timeseries_generators = TimeseriesGeneratorTypes(default_type=TimeseriesGenerator) @dataclass @@ -827,17 +862,21 @@ class TimeseriesGeneratorContainer: length: int +@timeseries_generators("GordoTimeseriesGenerator") class GordoTimeseriesGenerator(data_utils.Sequence): def __init__( self, - data: pd.DataFrame, - targets: pd.DataFrame, + data: Union[pd.DataFrame, np.ndarray], + targets: Union[pd.DataFrame, np.ndarray], length: int, batch_size: int = 128, shuffle: bool = False, step: Optional[Union[pd.Timedelta, int]] = None, ): - + if not isinstance(data, pd.DataFrame): + raise ValueError("Data have to be instance of pandas.DataFrame") + if not isinstance(targets, pd.DataFrame): + raise ValueError("Targets have to be instance of pandas.DataFrame") if len(data) != len(targets): raise ValueError( "Data and targets have to be of same length. " From 8da71bcc31a0b42ecaef833859755904289e14d3 Mon Sep 17 00:00:00 2001 From: Serhii Date: Fri, 10 Apr 2020 14:43:38 +0300 Subject: [PATCH 48/61] Use names of pd.Timedelta instead of number of minutes in config --- gordo/machine/model/models.py | 8 +++----- .../machine/model/test_gordo_timeseries_generator.py | 8 ++++---- tests/gordo/machine/model/test_model.py | 3 ++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 39448ef97..b150e8c0a 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -871,7 +871,7 @@ def __init__( length: int, batch_size: int = 128, shuffle: bool = False, - step: Optional[Union[pd.Timedelta, int]] = None, + step: Union[pd.Timedelta, str] = '10min', ): if not isinstance(data, pd.DataFrame): raise ValueError("Data have to be instance of pandas.DataFrame") @@ -884,10 +884,8 @@ def __init__( f" while target length is {len(targets)}" ) - if step is None: - step = pd.Timedelta(minutes=10) - if isinstance(step, int): - step = pd.Timedelta(minutes=step) + if isinstance(step, str): + step = pd.to_timedelta(step) self.step = step self.consecutive_chunks = self.find_consecutive_chunks(data) logger.debug( diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 660f1a7c4..6d763fe79 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -56,7 +56,7 @@ def test_find_consecutive_chunks(): ("2018-02-03", 20), ) test1_df = get_test_df(test1_time_intervals) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step='60min') expected_chunks = [ TimeseriesChunk( start_ts=pd.Timestamp("2018-01-01 00:00:00"), @@ -96,7 +96,7 @@ def test_create_generator_containers(): ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step=60) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step='60min') expected_generator_containers = [ { "chunk": TimeseriesChunk( @@ -138,7 +138,7 @@ def test_timeseries_generator(): ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals, generator=range_gen(), tags_count=1) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step=60) + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step='60min') assert len(gen.generators_containers) == 2 assert len(gen) == 6 x, y = gen[0] @@ -160,4 +160,4 @@ def test_too_short_timeseries_length(): ) test1_df = get_test_df(test1_time_intervals) with pytest.raises(ValueError): - GordoTimeseriesGenerator(test1_df, test1_df, length=10, step=60) + GordoTimeseriesGenerator(test1_df, test1_df, length=10, step='60min') diff --git a/tests/gordo/machine/model/test_model.py b/tests/gordo/machine/model/test_model.py index c2d0fdadf..f8e2bd029 100644 --- a/tests/gordo/machine/model/test_model.py +++ b/tests/gordo/machine/model/test_model.py @@ -337,7 +337,6 @@ def test_lstmae_predict_output(): out = model.predict(xTest) assert out.shape == (2, 3) - def test_keras_autoencoder_fits_callbacks(): model = KerasAutoEncoder( kind="feedforward_hourglass", @@ -410,3 +409,5 @@ def test_for_wrong_kind_import(): X, y = np.random.rand(10, 10), np.random.rand(10, 10) with pytest.raises(ValueError): model.fit(X, y) + +# TODO test with GordoTimeseriesGenerator From 7a24de233f4250d5933e6f5138824fd42eafde4c Mon Sep 17 00:00:00 2001 From: Serhii Date: Fri, 10 Apr 2020 18:29:59 +0300 Subject: [PATCH 49/61] Fix issues with of GordoTimeseriesGenerator and local_build() --- gordo/machine/model/models.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index b150e8c0a..e807227f1 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -507,8 +507,7 @@ def __init__( kwargs["lookback_window"] = lookback_window kwargs["kind"] = kind kwargs["batch_size"] = batch_size - - self.timeseries_generator_config = timeseries_generator + kwargs["timeseries_generator"] = timeseries_generator # fit_generator_params is a set of strings with the keyword arguments of # Keras fit_generator method (excluding "shuffle" as this will be hardcoded). @@ -539,6 +538,10 @@ def lookahead(self) -> int: """Steps ahead in y the model should target""" ... + @property + def timeseries_generator(self): + return self.kwargs.get('timeseries_generator', None) + def get_metadata(self): """ Add number of forecast steps to metadata @@ -601,7 +604,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": batch_size=1, lookback_window=self.lookback_window, lookahead=self.lookahead, - config=self.timeseries_generator_config, + config=self.timeseries_generator, ) primer_x, primer_y = tsg[0] @@ -614,7 +617,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs) -> "KerasLSTMForecast": batch_size=self.batch_size, lookback_window=self.lookback_window, lookahead=self.lookahead, - config=self.timeseries_generator_config, + config=self.timeseries_generator, ) gen_kwargs = { @@ -674,6 +677,7 @@ def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: batch_size=10000, lookback_window=self.lookback_window, lookahead=self.lookahead, + config=self.timeseries_generator, ) return self.model.predict_generator(tsg) @@ -829,7 +833,8 @@ def create_from_config(self, config, **kwargs): raise ValueError( f'Unknown type "{type_name}" for "timeseries_generator"' ) - all_kwargs = copy(config).pop("type") + all_kwargs = copy(config) + all_kwargs.pop("type") all_kwargs.update(kwargs) return self._types[type_name](**all_kwargs) From 3631f72720c74f47d53cfa951073ff88f82e0ed5 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 12:11:16 +0300 Subject: [PATCH 50/61] sklearn-pandas~=1.8.0 --- requirements/full_requirements.txt | 184 ++++++++++++++--------------- requirements/requirements.in | 2 + 2 files changed, 94 insertions(+), 92 deletions(-) diff --git a/requirements/full_requirements.txt b/requirements/full_requirements.txt index e311916a8..366cbd61d 100644 --- a/requirements/full_requirements.txt +++ b/requirements/full_requirements.txt @@ -4,147 +4,147 @@ # # pip-compile --output-file=full_requirements.txt mlflow_requirements.in postgres_requirements.in requirements.in # -absl-py==0.9.0 # via tensorboard, tensorflow -adal==1.2.2 # via azure-datalake-store, azureml-core, msrestazure -alembic==1.3.3 # via mlflow -aniso8601==8.0.0 # via flask-restplus +absl-py==0.11.0 # via tensorboard, tensorflow +adal==1.2.6 # via azure-datalake-store, azureml-core, msrestazure +alembic==1.4.1 # via mlflow +aniso8601==8.1.1 # via flask-restplus astor==0.8.1 # via tensorflow -attrs==19.3.0 # via jsonschema -azure-common==1.1.24 # via azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-resource, azure-mgmt-storage, azureml-core -azure-core==1.8.1 # via azure-identity, azure-storage-blob, azure-storage-file-datalake +attrs==20.3.0 # via jsonschema +azure-common==1.1.26 # via azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-resource, azure-mgmt-storage, azureml-core +azure-core==1.11.0 # via azure-identity, azure-storage-blob, azure-storage-file-datalake azure-datalake-store==0.0.51 # via gordo-dataset azure-graphrbac==0.61.1 # via azureml-core -azure-identity==1.4.0 # via -r requirements.in, gordo-dataset -azure-mgmt-authorization==0.60.0 # via azureml-core +azure-identity==1.4.1 # via -r requirements.in, gordo-dataset +azure-mgmt-authorization==0.61.0 # via azureml-core azure-mgmt-containerregistry==2.8.0 # via azureml-core -azure-mgmt-keyvault==2.0.0 # via azureml-core -azure-mgmt-resource==8.0.0 # via azureml-core -azure-mgmt-storage==7.1.0 # via azureml-core -azure-storage-blob==12.4.0 # via azure-storage-file-datalake -azure-storage-file-datalake==12.1.2 # via gordo-dataset -azureml-contrib-run==1.0.85 # via -r mlflow_requirements.in -azureml-core==1.0.85 # via azureml-mlflow -azureml-mlflow==1.0.85 # via azureml-contrib-run +azure-mgmt-keyvault==2.2.0 # via azureml-core +azure-mgmt-resource==12.0.0 # via azureml-core +azure-mgmt-storage==11.2.0 # via azureml-core +azure-storage-blob==12.7.1 # via azure-storage-file-datalake, mlflow +azure-storage-file-datalake==12.2.3 # via gordo-dataset +azureml-contrib-run==1.22.0 # via -r mlflow_requirements.in +azureml-core==1.22.0 # via azureml-mlflow +azureml-mlflow==1.22.0 # via azureml-contrib-run backports.tempfile==1.0 # via azureml-core backports.weakref==1.0.post1 # via backports.tempfile -cachetools==4.1.1 # via google-auth, gordo-dataset, gordo.client -catboost==0.20.2 # via -r requirements.in -cchardet==2.1.5 # via -r requirements.in -certifi==2019.11.28 # via msrest, requests -cffi==1.13.2 # via azure-datalake-store, cryptography -chardet==3.0.4 # via requests +cachetools==4.2.1 # via google-auth, gordo-dataset, gordo.client +catboost==0.24.4 # via -r requirements.in +cchardet==2.1.7 # via -r requirements.in +certifi==2020.12.5 # via msrest, requests +cffi==1.14.5 # via azure-datalake-store, cryptography +chardet==4.0.0 # via requests click==7.1.2 # via -r requirements.in, databricks-cli, flask, gordo.client, mlflow -cloudpickle==1.2.2 # via mlflow -configparser==4.0.2 # via databricks-cli +cloudpickle==1.6.0 # via mlflow contextlib2==0.6.0.post1 # via azureml-core -cryptography==3.3.1 # via adal, azure-identity, azure-storage-blob, azureml-core, gordo-dataset, pyjwt, pyopenssl, secretstorage +cryptography==3.4.6 # via adal, azure-identity, azure-storage-blob, azureml-core, gordo-dataset, msal, pyjwt, pyopenssl, secretstorage cycler==0.10.0 # via matplotlib -databricks-cli==0.9.1 # via mlflow -dataclasses-json==0.3.7 # via -r requirements.in +databricks-cli==0.14.1 # via mlflow +dataclasses-json==0.5.2 # via -r requirements.in dictdiffer==0.8.1 # via -r requirements.in -docker==4.1.0 # via azureml-core, mlflow +docker==4.4.3 # via azureml-core, mlflow entrypoints==0.3 # via mlflow flask-restplus==0.13.0 # via -r requirements.in -flask==1.1.1 # via -r requirements.in, flask-restplus, mlflow, prometheus-flask-exporter +flask==1.1.2 # via -r requirements.in, flask-restplus, mlflow, prometheus-flask-exporter gast==0.2.2 # via tensorflow -gitdb2==2.0.6 # via gitpython -gitpython==3.0.5 # via mlflow -google-auth-oauthlib==0.4.1 # via tensorboard -google-auth==1.10.1 # via google-auth-oauthlib, tensorboard -google-pasta==0.1.8 # via tensorflow -gordo-dataset==2.4.0 # via -r requirements.in, gordo.client +gitdb==4.0.5 # via gitpython +gitpython==3.1.13 # via mlflow +google-auth-oauthlib==0.4.2 # via tensorboard +google-auth==1.27.0 # via google-auth-oauthlib, tensorboard +google-pasta==0.2.0 # via tensorflow +gordo-dataset==2.4.1 # via -r requirements.in, gordo.client gordo.client==0.2.12 # via -r requirements.in -gorilla==0.3.0 # via mlflow -graphviz==0.13.2 # via catboost -grpcio==1.26.0 # via tensorboard, tensorflow +graphviz==0.16 # via catboost +grpcio==1.35.0 # via tensorboard, tensorflow gunicorn==20.0.4 # via -r requirements.in, mlflow h5py==2.10.0 # via -r requirements.in, keras-applications, tensorflow -idna==2.8 # via requests -importlib-metadata==1.4.0 # via jsonschema -influxdb==5.3.0 # via gordo-dataset, gordo.client +idna==2.10 # via requests +importlib-metadata==3.4.0 # via jsonpickle, jsonschema, markdown +influxdb==5.3.1 # via gordo-dataset, gordo.client isodate==0.6.0 # via msrest itsdangerous==1.1.0 # via flask jeepney==0.6.0 # via -r requirements.in, secretstorage -jinja2==2.10.3 # via -r requirements.in, flask -jmespath==0.9.4 # via azureml-core -joblib==0.14.1 # via scikit-learn -jsonpickle==1.2 # via azureml-core, azureml-mlflow +jinja2==2.11.3 # via -r requirements.in, flask +jmespath==0.10.0 # via azureml-core +joblib==1.0.1 # via scikit-learn +jsonpickle==2.0.0 # via azureml-core, azureml-mlflow jsonschema==3.2.0 # via flask-restplus keras-applications==1.0.8 # via tensorflow keras-preprocessing==1.1.0 # via tensorflow -kiwisolver==1.1.0 # via matplotlib -mako==1.1.1 # via alembic -markdown==3.1.1 # via tensorboard +kiwisolver==1.3.1 # via matplotlib +mako==1.1.4 # via alembic +markdown==3.3.3 # via tensorboard markupsafe==1.1.1 # via jinja2, mako marshmallow-enum==1.5.1 # via dataclasses-json -marshmallow==3.3.0 # via dataclasses-json, gordo-dataset, marshmallow-enum -matplotlib==3.1.2 # via catboost -mlflow==1.5.0 # via -r mlflow_requirements.in, azureml-mlflow -more-itertools==8.1.0 # via zipp +marshmallow==3.10.0 # via dataclasses-json, gordo-dataset, marshmallow-enum +matplotlib==3.3.4 # via catboost +mlflow==1.13.1 # via -r mlflow_requirements.in, azureml-mlflow msal-extensions==0.2.2 # via azure-identity -msal==1.5.0 # via azure-identity, msal-extensions -msgpack==0.6.1 # via influxdb -msrest==0.6.10 # via azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-resource, azure-mgmt-storage, azure-storage-blob, azure-storage-file-datalake, azureml-core, msrestazure -msrestazure==0.6.2 # via azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-resource, azure-mgmt-storage, azureml-core +msal==1.9.0 # via azure-identity, msal-extensions +msgpack==1.0.2 # via influxdb +msrest==0.6.21 # via azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-resource, azure-mgmt-storage, azure-storage-blob, azure-storage-file-datalake, azureml-core, msrestazure +msrestazure==0.6.4 # via azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-resource, azure-mgmt-storage, azureml-core mypy-extensions==0.4.3 # via typing-inspect ndg-httpsclient==0.5.1 # via azureml-core -numexpr==2.7.1 # via -r requirements.in, gordo-dataset -numpy==1.18.1 # via -r requirements.in, catboost, gordo.client, h5py, keras-applications, keras-preprocessing, matplotlib, mlflow, numexpr, opt-einsum, pandas, pyarrow, scikit-learn, scipy, tensorboard, tensorflow, xarray +numexpr==2.7.2 # via -r requirements.in, gordo-dataset +numpy==1.18.5 # via -r requirements.in, catboost, gordo.client, h5py, keras-applications, keras-preprocessing, matplotlib, mlflow, numexpr, opt-einsum, pandas, pyarrow, scikit-learn, scipy, sklearn-pandas, tensorboard, tensorflow, xarray oauthlib==3.1.0 # via requests-oauthlib -opt-einsum==3.1.0 # via tensorflow -packaging==20.7 # via -r requirements.in -pandas==1.1.4 # via -r requirements.in, catboost, gordo-dataset, gordo.client, mlflow, xarray -pathspec==0.7.0 # via azureml-core -peewee==3.13.1 # via -r postgres_requirements.in -plotly==4.4.1 # via catboost +opt-einsum==3.3.0 # via tensorflow +packaging==20.9 # via -r requirements.in +pandas==1.2.2 # via -r requirements.in, catboost, gordo-dataset, gordo.client, mlflow, sklearn-pandas, xarray +pathspec==0.8.1 # via azureml-core +peewee==3.14.1 # via -r postgres_requirements.in +pillow==8.1.0 # via matplotlib +plotly==4.14.3 # via catboost portalocker==1.7.1 # via msal-extensions prometheus-client==0.7.1 # via -r requirements.in, prometheus-flask-exporter -prometheus-flask-exporter==0.12.1 # via mlflow -protobuf==3.11.2 # via mlflow, tensorboard, tensorflow +prometheus-flask-exporter==0.18.1 # via mlflow +protobuf==3.14.0 # via mlflow, tensorboard, tensorflow psycopg2-binary==2.8.4 # via -r postgres_requirements.in pyarrow==0.17.1 # via gordo-dataset, gordo.client pyasn1-modules==0.2.8 # via google-auth pyasn1==0.4.8 # via ndg-httpsclient, pyasn1-modules, rsa -pycparser==2.19 # via cffi +pycparser==2.20 # via cffi pydantic==1.7.3 # via -r requirements.in, gordo.client -pyjwt[crypto]==1.7.1 # via adal, azureml-core, msal -pyopenssl==19.1.0 # via azureml-core, ndg-httpsclient -pyparsing==2.4.6 # via matplotlib, packaging -pyrsistent==0.15.7 # via jsonschema +pyjwt[crypto]==2.0.1 # via adal, azureml-core, msal +pyopenssl==20.0.1 # via azureml-core, ndg-httpsclient +pyparsing==2.4.7 # via matplotlib, packaging +pyrsistent==0.17.3 # via jsonschema python-dateutil==2.8.1 # via -r requirements.in, adal, alembic, azureml-core, influxdb, matplotlib, mlflow, pandas python-editor==1.0.4 # via alembic -pytz==2019.3 # via azureml-core, flask-restplus, influxdb, pandas +pytz==2021.1 # via azureml-core, flask-restplus, influxdb, pandas pyyaml==5.3.1 # via -r requirements.in, gordo-dataset, mlflow querystring-parser==1.2.4 # via mlflow requests-oauthlib==1.3.0 # via google-auth-oauthlib, msrest -requests==2.22.0 # via -r requirements.in, adal, azure-core, azure-datalake-store, azureml-core, databricks-cli, docker, gordo.client, influxdb, mlflow, msal, msrest, requests-oauthlib, tensorboard +requests==2.25.1 # via -r requirements.in, adal, azure-core, azure-datalake-store, azureml-core, databricks-cli, docker, gordo.client, influxdb, mlflow, msal, msrest, requests-oauthlib, tensorboard retrying==1.3.3 # via plotly -rsa==4.0 # via google-auth -ruamel.yaml==0.15.89 # via azureml-core -scikit-learn==0.23.2 # via -r requirements.in, gordo-dataset, gordo.client -scipy==1.4.1 # via catboost, scikit-learn -secretstorage==3.1.2 # via azureml-core -simplejson==3.17.2 # via -r requirements.in, gordo.client, mlflow -six==1.14.0 # via absl-py, azure-core, azure-identity, azureml-core, catboost, cryptography, cycler, databricks-cli, docker, flask-restplus, google-auth, google-pasta, grpcio, h5py, influxdb, isodate, jsonschema, keras-preprocessing, mlflow, plotly, protobuf, pyopenssl, pyrsistent, python-dateutil, querystring-parser, retrying, tensorboard, tensorflow, websocket-client -smmap2==2.0.5 # via gitdb2 -sqlalchemy==1.3.13 # via alembic, mlflow -sqlparse==0.3.0 # via mlflow +rsa==4.7.1 # via google-auth +ruamel.yaml.clib==0.2.2 # via ruamel.yaml +ruamel.yaml==0.16.12 # via azureml-core +scikit-learn==0.23.2 # via -r requirements.in, gordo-dataset, gordo.client, sklearn-pandas +scipy==1.6.1 # via catboost, scikit-learn, sklearn-pandas +secretstorage==3.3.1 # via azureml-core +simplejson==3.17.2 # via -r requirements.in, gordo.client +six==1.15.0 # via absl-py, azure-core, azure-identity, catboost, cycler, databricks-cli, docker, flask-restplus, google-auth, google-pasta, grpcio, h5py, influxdb, isodate, jsonschema, keras-preprocessing, mlflow, msrestazure, plotly, protobuf, pyopenssl, python-dateutil, querystring-parser, retrying, tensorboard, tensorflow, websocket-client +sklearn-pandas==1.8.0 # via -r requirements.in +smmap==3.0.5 # via gitdb +sqlalchemy==1.3.23 # via alembic, mlflow +sqlparse==0.4.1 # via mlflow stringcase==1.2.0 # via dataclasses-json -tabulate==0.8.6 # via databricks-cli -tensorboard==2.1.0 # via tensorflow +tabulate==0.8.8 # via databricks-cli +tensorboard==2.1.1 # via tensorflow tensorflow-estimator==2.1.0 # via tensorflow tensorflow==2.1.3 # via -r requirements.in termcolor==1.1.0 # via tensorflow threadpoolctl==2.1.0 # via scikit-learn -typing-extensions==3.7.4.1 # via -r requirements.in, gordo-dataset, typing-inspect -typing-inspect==0.5.0 # via dataclasses-json -urllib3==1.25.7 # via -r requirements.in, azureml-core, requests +typing-extensions==3.7.4.3 # via -r requirements.in, gordo-dataset, importlib-metadata, typing-inspect +typing-inspect==0.6.0 # via dataclasses-json +urllib3==1.26.3 # via -r requirements.in, azureml-core, requests websocket-client==0.57.0 # via docker werkzeug==0.16.1 # via -r requirements.in, flask, tensorboard -wheel==0.33.6 # via tensorboard, tensorflow -wrapt==1.11.2 # via gordo.client, tensorflow +wheel==0.36.2 # via tensorboard, tensorflow +wrapt==1.12.1 # via -r requirements.in, gordo.client, tensorflow xarray==0.16.2 # via gordo-dataset -zipp==2.0.0 # via importlib-metadata +zipp==3.4.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/requirements/requirements.in b/requirements/requirements.in index 8229fcdd2..5ae6dd826 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -29,3 +29,5 @@ jeepney>=0.6 packaging~=20.7 pydantic~=1.7.3 gordo.client~=0.2.12 +wrapt~=1.11 +sklearn-pandas~=1.8.0 From 0260043397cc81f7f0250619fab4d4df195359af Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 13:38:03 +0300 Subject: [PATCH 51/61] Play around with DataFrameMapper --- gordo/machine/model/data_frame_mapper.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 gordo/machine/model/data_frame_mapper.py diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py new file mode 100644 index 000000000..d39269059 --- /dev/null +++ b/gordo/machine/model/data_frame_mapper.py @@ -0,0 +1,29 @@ +from pydoc import locate +from sklearn_pandas import DataFrameMapper, gen_features +from copy import copy +from typing import List, Union, Optional + + +class DataFrameMapper(DataFrameMapper): + + _default_kwargs = { + "df_out": True + } + + def __init__(self, columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, **kwargs): + if classes is not None: + classes = copy(classes) + self._prepare_classes(classes) + features = gen_features(columns=columns, classes=classes) + base_kwargs = copy(self._default_kwargs) + base_kwargs.update(kwargs) + super().__init__(features=features, **kwargs) + + @staticmethod + def _prepare_classes(classes: List[dict]): + for i, v in enumerate(classes): + if "class" not in v: + raise ValueError("\"class\" attribute is empty") + if isinstance(v["class"], str): + cls = locate(v["class"]) + classes[i]["class"] = cls From 1638ad6d2b5e9c10f5aaa2eaed3d2f7d6ca2053c Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 13:39:25 +0300 Subject: [PATCH 52/61] Fix kwargs argument for DataFrameMapper --- gordo/machine/model/data_frame_mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index d39269059..ff76a51dd 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -17,7 +17,7 @@ def __init__(self, columns: List[Union[str, List[str]]], classes: Optional[List[ features = gen_features(columns=columns, classes=classes) base_kwargs = copy(self._default_kwargs) base_kwargs.update(kwargs) - super().__init__(features=features, **kwargs) + super().__init__(features=features, **base_kwargs) @staticmethod def _prepare_classes(classes: List[dict]): From 7f399e19f047e62010ffc13f0f88d01ac0d1b4e8 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 13:40:00 +0300 Subject: [PATCH 53/61] black --- gordo/machine/model/data_frame_mapper.py | 13 ++++++++----- gordo/machine/model/models.py | 4 ++-- .../model/test_gordo_timeseries_generator.py | 10 ++++++---- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index ff76a51dd..dd74be58c 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -6,11 +6,14 @@ class DataFrameMapper(DataFrameMapper): - _default_kwargs = { - "df_out": True - } + _default_kwargs = {"df_out": True} - def __init__(self, columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, **kwargs): + def __init__( + self, + columns: List[Union[str, List[str]]], + classes: Optional[List[dict]] = None, + **kwargs + ): if classes is not None: classes = copy(classes) self._prepare_classes(classes) @@ -23,7 +26,7 @@ def __init__(self, columns: List[Union[str, List[str]]], classes: Optional[List[ def _prepare_classes(classes: List[dict]): for i, v in enumerate(classes): if "class" not in v: - raise ValueError("\"class\" attribute is empty") + raise ValueError('"class" attribute is empty') if isinstance(v["class"], str): cls = locate(v["class"]) classes[i]["class"] = cls diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index e807227f1..6b53adfee 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -540,7 +540,7 @@ def lookahead(self) -> int: @property def timeseries_generator(self): - return self.kwargs.get('timeseries_generator', None) + return self.kwargs.get("timeseries_generator", None) def get_metadata(self): """ @@ -876,7 +876,7 @@ def __init__( length: int, batch_size: int = 128, shuffle: bool = False, - step: Union[pd.Timedelta, str] = '10min', + step: Union[pd.Timedelta, str] = "10min", ): if not isinstance(data, pd.DataFrame): raise ValueError("Data have to be instance of pandas.DataFrame") diff --git a/tests/gordo/machine/model/test_gordo_timeseries_generator.py b/tests/gordo/machine/model/test_gordo_timeseries_generator.py index 6d763fe79..efd32ac64 100644 --- a/tests/gordo/machine/model/test_gordo_timeseries_generator.py +++ b/tests/gordo/machine/model/test_gordo_timeseries_generator.py @@ -56,7 +56,7 @@ def test_find_consecutive_chunks(): ("2018-02-03", 20), ) test1_df = get_test_df(test1_time_intervals) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step='60min') + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step="60min") expected_chunks = [ TimeseriesChunk( start_ts=pd.Timestamp("2018-01-01 00:00:00"), @@ -96,7 +96,7 @@ def test_create_generator_containers(): ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step='60min') + gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, step="60min") expected_generator_containers = [ { "chunk": TimeseriesChunk( @@ -138,7 +138,9 @@ def test_timeseries_generator(): ("2018-01-04", 10), ) test1_df = get_test_df(test1_time_intervals, generator=range_gen(), tags_count=1) - gen = GordoTimeseriesGenerator(test1_df, test1_df, length=5, batch_size=3, step='60min') + gen = GordoTimeseriesGenerator( + test1_df, test1_df, length=5, batch_size=3, step="60min" + ) assert len(gen.generators_containers) == 2 assert len(gen) == 6 x, y = gen[0] @@ -160,4 +162,4 @@ def test_too_short_timeseries_length(): ) test1_df = get_test_df(test1_time_intervals) with pytest.raises(ValueError): - GordoTimeseriesGenerator(test1_df, test1_df, length=10, step='60min') + GordoTimeseriesGenerator(test1_df, test1_df, length=10, step="60min") From 663e53ee5911b00b725b4b6f417bd52d831e60bc Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 14:04:51 +0300 Subject: [PATCH 54/61] Deal with DataFrameMapper.__set_state__() and DataFrameMapper.__get_state__() --- gordo/machine/model/data_frame_mapper.py | 43 ++++++++++++++++++------ 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index dd74be58c..8db9d2d04 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -1,8 +1,12 @@ +import logging + from pydoc import locate from sklearn_pandas import DataFrameMapper, gen_features from copy import copy from typing import List, Union, Optional +logger = logging.getLogger(__name__) + class DataFrameMapper(DataFrameMapper): @@ -14,19 +18,36 @@ def __init__( classes: Optional[List[dict]] = None, **kwargs ): - if classes is not None: - classes = copy(classes) - self._prepare_classes(classes) - features = gen_features(columns=columns, classes=classes) + self.columns = columns + self.classes = classes + features = self._build_features(columns, classes) base_kwargs = copy(self._default_kwargs) base_kwargs.update(kwargs) super().__init__(features=features, **base_kwargs) @staticmethod - def _prepare_classes(classes: List[dict]): - for i, v in enumerate(classes): - if "class" not in v: - raise ValueError('"class" attribute is empty') - if isinstance(v["class"], str): - cls = locate(v["class"]) - classes[i]["class"] = cls + def _build_features( + columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, + ): + if classes is not None: + classes = copy(classes) + for i, v in enumerate(classes): + if "class" not in v: + raise ValueError('"class" attribute is empty') + if isinstance(v["class"], str): + cls = locate(v["class"]) + classes[i]["class"] = cls + logger.debug("_build_features for columns=%s, classes=%s)", columns, classes) + return gen_features(columns=columns, classes=classes) + + def __getstate__(self): + state = super().__getstate__() + state["columns"] = self.columns + state["classes"] = self.classes + del state["features"] + return state + + def __setstate__(self, state): + features = self._build_features(state.get("columns"), state.get("classes")) + state["features"] = features + super().__setstate__(state) From 44d1d4b70f6a656a482893e4840b7b1231a92c3e Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 14:17:02 +0300 Subject: [PATCH 55/61] Some small fixes for DataFrameMapper --- gordo/machine/model/__init__.py | 1 + gordo/machine/model/data_frame_mapper.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gordo/machine/model/__init__.py b/gordo/machine/model/__init__.py index e69de29bb..fc2c16453 100644 --- a/gordo/machine/model/__init__.py +++ b/gordo/machine/model/__init__.py @@ -0,0 +1 @@ +from .data_frame_mapper import DataFrameMapper \ No newline at end of file diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index 8db9d2d04..6b9168b1e 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -2,7 +2,7 @@ from pydoc import locate from sklearn_pandas import DataFrameMapper, gen_features -from copy import copy +from copy import copy, deepcopy from typing import List, Union, Optional logger = logging.getLogger(__name__) @@ -30,7 +30,7 @@ def _build_features( columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, ): if classes is not None: - classes = copy(classes) + classes = deepcopy(classes) for i, v in enumerate(classes): if "class" not in v: raise ValueError('"class" attribute is empty') From 91f8377d5040eaba7631b8752e9371ccfe2531d5 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 15:25:46 +0300 Subject: [PATCH 56/61] black --- gordo/machine/model/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gordo/machine/model/__init__.py b/gordo/machine/model/__init__.py index fc2c16453..bb2321894 100644 --- a/gordo/machine/model/__init__.py +++ b/gordo/machine/model/__init__.py @@ -1 +1 @@ -from .data_frame_mapper import DataFrameMapper \ No newline at end of file +from .data_frame_mapper import DataFrameMapper From 1598b6000a88d807af864c707bbe23d79d266f25 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 18:22:26 +0300 Subject: [PATCH 57/61] Fix tests. Add lookahead for TimeseriesGenerator --- gordo/machine/model/data_frame_mapper.py | 8 +++---- gordo/machine/model/models.py | 29 +++++++++++++++++++++--- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/machine/model/data_frame_mapper.py index 6b9168b1e..53b642c31 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/machine/model/data_frame_mapper.py @@ -1,14 +1,14 @@ import logging +import sklearn_pandas from pydoc import locate -from sklearn_pandas import DataFrameMapper, gen_features from copy import copy, deepcopy from typing import List, Union, Optional logger = logging.getLogger(__name__) -class DataFrameMapper(DataFrameMapper): +class DataFrameMapper(sklearn_pandas.DataFrameMapper): _default_kwargs = {"df_out": True} @@ -37,8 +37,8 @@ def _build_features( if isinstance(v["class"], str): cls = locate(v["class"]) classes[i]["class"] = cls - logger.debug("_build_features for columns=%s, classes=%s)", columns, classes) - return gen_features(columns=columns, classes=classes) + logger.debug("_build_features for columns=%s, classes=%s", columns, classes) + return sklearn_pandas.gen_features(columns=columns, classes=classes) def __getstate__(self): state = super().__getstate__() diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index 6b53adfee..a615c0aac 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -809,9 +809,13 @@ def create_keras_timeseriesgenerator( >>> len(gen[0][0][0][0]) # n_features = 2 2 """ - X, y = pad_x_and_y(X, y, lookahead) return timeseries_generators.create_from_config( - config, data=X, targets=y, length=lookback_window, batch_size=batch_size + config, + data=X, + targets=y, + length=lookback_window, + batch_size=batch_size, + lookahead=lookahead, ) @@ -850,7 +854,23 @@ def wrap(cls): return wrap -timeseries_generators = TimeseriesGeneratorTypes(default_type=TimeseriesGenerator) +class DefaultTimeseriesGenertor(TimeseriesGenerator): + def __init__( + self, + data: Union[pd.DataFrame, np.ndarray], + targets: Union[pd.DataFrame, np.ndarray], + lookahead: int = 1, + **kwargs, + ): + if isinstance(data, pd.DataFrame): + data = data.values + if isinstance(targets, pd.DataFrame): + targets = targets.values + data, targets = pad_x_and_y(data, targets, lookahead) + super().__init__(data=data, targets=targets, **kwargs) + + +timeseries_generators = TimeseriesGeneratorTypes(default_type=DefaultTimeseriesGenertor) @dataclass @@ -877,6 +897,7 @@ def __init__( batch_size: int = 128, shuffle: bool = False, step: Union[pd.Timedelta, str] = "10min", + lookahead: int = 1, ): if not isinstance(data, pd.DataFrame): raise ValueError("Data have to be instance of pandas.DataFrame") @@ -910,6 +931,8 @@ def __init__( "Seems like the time series are too small or in random order." "Failed chunks: %s" % self.consecutive_chunks ) + # TODO use lookahead + self.lookahead = lookahead def filter_chunks(self, indexes=None): if indexes is not None: From 49467c9ac77de587f5ee468317187666559165ac Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 18:23:43 +0300 Subject: [PATCH 58/61] typo --- gordo/machine/model/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index a615c0aac..fce77633b 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -854,7 +854,7 @@ def wrap(cls): return wrap -class DefaultTimeseriesGenertor(TimeseriesGenerator): +class DefaultTimeseriesGenerator(TimeseriesGenerator): def __init__( self, data: Union[pd.DataFrame, np.ndarray], @@ -870,7 +870,7 @@ def __init__( super().__init__(data=data, targets=targets, **kwargs) -timeseries_generators = TimeseriesGeneratorTypes(default_type=DefaultTimeseriesGenertor) +timeseries_generators = TimeseriesGeneratorTypes(default_type=DefaultTimeseriesGenerator) @dataclass From 0c6279e635013d16f479ebf51967e9479d857439 Mon Sep 17 00:00:00 2001 From: Serhii Date: Sat, 11 Apr 2020 18:37:38 +0300 Subject: [PATCH 59/61] black --- gordo/machine/model/models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gordo/machine/model/models.py b/gordo/machine/model/models.py index fce77633b..db21035c2 100644 --- a/gordo/machine/model/models.py +++ b/gordo/machine/model/models.py @@ -870,7 +870,9 @@ def __init__( super().__init__(data=data, targets=targets, **kwargs) -timeseries_generators = TimeseriesGeneratorTypes(default_type=DefaultTimeseriesGenerator) +timeseries_generators = TimeseriesGeneratorTypes( + default_type=DefaultTimeseriesGenerator +) @dataclass From 05a91bf58c8827b1e37566fcaf233be55d54da81 Mon Sep 17 00:00:00 2001 From: user Date: Wed, 15 Apr 2020 16:57:37 +0300 Subject: [PATCH 60/61] Steel not working properly --- .../__init__.py} | 25 +++++++------- gordo/machine/model/__init__.py | 1 - gordo/machine/validators.py | 3 ++ gordo/serializer/__init__.py | 2 ++ gordo/serializer/from_definition.py | 33 +++++++++++++++++-- 5 files changed, 49 insertions(+), 15 deletions(-) rename gordo/{machine/model/data_frame_mapper.py => data_frame_mapper/__init__.py} (69%) diff --git a/gordo/machine/model/data_frame_mapper.py b/gordo/data_frame_mapper/__init__.py similarity index 69% rename from gordo/machine/model/data_frame_mapper.py rename to gordo/data_frame_mapper/__init__.py index 53b642c31..73e7bcda9 100644 --- a/gordo/machine/model/data_frame_mapper.py +++ b/gordo/data_frame_mapper/__init__.py @@ -9,14 +9,13 @@ class DataFrameMapper(sklearn_pandas.DataFrameMapper): - _default_kwargs = {"df_out": True} def __init__( - self, - columns: List[Union[str, List[str]]], - classes: Optional[List[dict]] = None, - **kwargs + self, + columns: List[Union[str, List[str]]], + classes: Optional[List[dict]] = None, + **kwargs ): self.columns = columns self.classes = classes @@ -27,16 +26,17 @@ def __init__( @staticmethod def _build_features( - columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, + columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, ): if classes is not None: classes = deepcopy(classes) for i, v in enumerate(classes): - if "class" not in v: - raise ValueError('"class" attribute is empty') - if isinstance(v["class"], str): - cls = locate(v["class"]) - classes[i]["class"] = cls + if isinstance(v, dict): + if "class" not in v: + raise ValueError('"class" attribute is empty') + if isinstance(v["class"], str): + cls = locate(v["class"]) + classes[i]["class"] = cls logger.debug("_build_features for columns=%s, classes=%s", columns, classes) return sklearn_pandas.gen_features(columns=columns, classes=classes) @@ -51,3 +51,6 @@ def __setstate__(self, state): features = self._build_features(state.get("columns"), state.get("classes")) state["features"] = features super().__setstate__(state) + + +__all__ = ['DataFrameMapper'] diff --git a/gordo/machine/model/__init__.py b/gordo/machine/model/__init__.py index bb2321894..e69de29bb 100644 --- a/gordo/machine/model/__init__.py +++ b/gordo/machine/model/__init__.py @@ -1 +0,0 @@ -from .data_frame_mapper import DataFrameMapper diff --git a/gordo/machine/validators.py b/gordo/machine/validators.py index ef6d8a223..7ef248662 100644 --- a/gordo/machine/validators.py +++ b/gordo/machine/validators.py @@ -14,6 +14,8 @@ logger = logging.getLogger(__name__) +logger.debug("from_definition1=%s", from_definition) + class BaseDescriptor: """ @@ -85,6 +87,7 @@ class ValidModel(BaseDescriptor): def __set__(self, instance, value): if getattr(instance, "_strict", True): try: + logger.debug("from_definition=%s", from_definition) from_definition(value) except Exception as e: raise ValueError(f"Pipeline from definition failed: {e}") diff --git a/gordo/serializer/__init__.py b/gordo/serializer/__init__.py index 66285bf10..7e0dd823e 100644 --- a/gordo/serializer/__init__.py +++ b/gordo/serializer/__init__.py @@ -1,3 +1,5 @@ from .from_definition import from_definition, load_params_from_definition from .into_definition import into_definition, load_definition_from_params from .serializer import dump, dumps, load, loads, load_metadata + +__all__=['from_definition', 'into_definition', 'dump', 'dumps', 'load', 'loads', 'load_metadata'] diff --git a/gordo/serializer/from_definition.py b/gordo/serializer/from_definition.py index 7a6898c3d..b769eaba4 100644 --- a/gordo/serializer/from_definition.py +++ b/gordo/serializer/from_definition.py @@ -4,11 +4,13 @@ import pydoc import copy import typing # noqa -from typing import Union, Dict, Any, Iterable +from typing import Union, Dict, Any, Iterable, Type, Optional from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.base import BaseEstimator from tensorflow.keras.models import Sequential +from gordo.data_frame_mapper import DataFrameMapper + logger = logging.getLogger(__name__) @@ -66,7 +68,7 @@ def from_definition( def _build_branch( definition: Iterable[Union[str, Dict[Any, Any]]], - constructor_class=Union[Pipeline, None], + constructor_class: Optional[Type[Pipeline]] = None, ): """ Builds a branch of the tree and optionally constructs the class with the given @@ -177,6 +179,11 @@ def _build_step( f"Got {StepClass} but the supplied parameters" f"seem invalid: {params}" ) + + if issubclass(StepClass, DataFrameMapper): + params = _load_data_mapper_params(params) + + logger.debug("StopClass(%s)", params) return StepClass(**params) # If step is just a string, can initialize it without any params @@ -217,6 +224,16 @@ def _build_callbacks(definitions: list): return callbacks +def _load_data_mapper_params(params: dict): + if "classes" in params: + classes = copy.deepcopy(params["classes"]) + if not isinstance(classes, list): + raise TypeError('"classes" should be a list') + logger.debug("classes=%s", classes) + params["classes"] = _build_branch(classes) + return params + + def _load_param_classes(params: dict): """ Inspect the params' values and determine if any can be loaded as a class. @@ -255,6 +272,7 @@ def _load_param_classes(params: dict): objects """ params = copy.copy(params) + logger.debug("_load_param_classes=%s", params) for key, value in params.items(): # If value is a simple string, try to load the model/class @@ -289,7 +307,16 @@ def _load_param_classes(params: dict): params[key] = from_definition(value) else: # Call this func again, incase there is nested occurances of this problem in these kwargs - kwargs = _load_param_classes(sub_params) + sub_params = value[list(value.keys())[0]] + + if issubclass(Model, DataFrameMapper): + kwargs = _load_data_mapper_params(sub_params) + logger.debug( + "_load_data_mapper_params(%s)=%s", sub_params, kwargs + ) + else: + kwargs = _load_param_classes(sub_params) + params[key] = Model(**kwargs) # type: ignore elif key == "callbacks" and isinstance(value, list): params[key] = _build_callbacks(value) From b7823533681366541d456af6dd32e5e877e18c28 Mon Sep 17 00:00:00 2001 From: user Date: Wed, 15 Apr 2020 17:48:47 +0300 Subject: [PATCH 61/61] Works for simples cases --- gordo/data_frame_mapper/__init__.py | 33 ++++++++++++----------------- gordo/serializer/from_definition.py | 10 ++++----- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/gordo/data_frame_mapper/__init__.py b/gordo/data_frame_mapper/__init__.py index 73e7bcda9..c6b512840 100644 --- a/gordo/data_frame_mapper/__init__.py +++ b/gordo/data_frame_mapper/__init__.py @@ -1,9 +1,9 @@ import logging import sklearn_pandas -from pydoc import locate -from copy import copy, deepcopy -from typing import List, Union, Optional +from copy import copy +from sklearn.base import BaseEstimator +from typing import List, Union logger = logging.getLogger(__name__) @@ -14,41 +14,34 @@ class DataFrameMapper(sklearn_pandas.DataFrameMapper): def __init__( self, columns: List[Union[str, List[str]]], - classes: Optional[List[dict]] = None, + transformers: List[BaseEstimator] = None, **kwargs ): self.columns = columns - self.classes = classes - features = self._build_features(columns, classes) + self.transformers = transformers + features = self._build_features(columns, transformers) base_kwargs = copy(self._default_kwargs) base_kwargs.update(kwargs) super().__init__(features=features, **base_kwargs) @staticmethod def _build_features( - columns: List[Union[str, List[str]]], classes: Optional[List[dict]] = None, + columns: List[Union[str, List[str]]], transformers: List[BaseEstimator], ): - if classes is not None: - classes = deepcopy(classes) - for i, v in enumerate(classes): - if isinstance(v, dict): - if "class" not in v: - raise ValueError('"class" attribute is empty') - if isinstance(v["class"], str): - cls = locate(v["class"]) - classes[i]["class"] = cls - logger.debug("_build_features for columns=%s, classes=%s", columns, classes) - return sklearn_pandas.gen_features(columns=columns, classes=classes) + features = [] + for column in columns: + features.append((column, transformers)) + return features def __getstate__(self): state = super().__getstate__() state["columns"] = self.columns - state["classes"] = self.classes + state["transformers"] = self.transformers del state["features"] return state def __setstate__(self, state): - features = self._build_features(state.get("columns"), state.get("classes")) + features = self._build_features(state.get("columns"), state.get("transformers")) state["features"] = features super().__setstate__(state) diff --git a/gordo/serializer/from_definition.py b/gordo/serializer/from_definition.py index b769eaba4..59581ead6 100644 --- a/gordo/serializer/from_definition.py +++ b/gordo/serializer/from_definition.py @@ -225,12 +225,12 @@ def _build_callbacks(definitions: list): def _load_data_mapper_params(params: dict): - if "classes" in params: - classes = copy.deepcopy(params["classes"]) + if "transformers" in params: + classes = copy.deepcopy(params["transformers"]) if not isinstance(classes, list): - raise TypeError('"classes" should be a list') - logger.debug("classes=%s", classes) - params["classes"] = _build_branch(classes) + raise TypeError('"transformers" should be a list') + logger.debug("transformers=%s", classes) + params["transformers"] = _build_branch(classes) return params