diff --git a/gordo/machine/model/anomaly/diff.py b/gordo/machine/model/anomaly/diff.py index c2a88198b..f92d92ae4 100644 --- a/gordo/machine/model/anomaly/diff.py +++ b/gordo/machine/model/anomaly/diff.py @@ -72,7 +72,10 @@ def get_metadata(self): metadata[ "feature-thresholds-per-fold" ] = self.feature_thresholds_per_fold_.to_dict() - + if hasattr(self, "aggregate_thresholds_per_fold_"): + metadata[ + "aggregate-thresholds-per-fold" + ] = self.aggregate_thresholds_per_fold_ if isinstance(self.base_estimator, GordoBase): metadata.update(self.base_estimator.get_metadata()) else: @@ -128,7 +131,7 @@ def cross_validate( cv_output = cross_validate(self, X=X, y=y, **kwargs) self.feature_thresholds_per_fold_ = pd.DataFrame() - scaled_mse_per_timestep = pd.Series() + self.aggregate_thresholds_per_fold_ = {} for i, ((test_idxs, _train_idxs), split_model) in enumerate( zip(kwargs["cv"].split(X, y), cv_output["estimator"]) @@ -143,21 +146,20 @@ def cross_validate( # Model's timestep scaled mse scaled_mse = self._scaled_mse_per_timestep(split_model, y_true, y_pred) - scaled_mse_per_timestep = pd.concat((scaled_mse_per_timestep, scaled_mse)) - + # For the aggregate threshold for the fold model, use the mse of scaled residuals per timestep + aggregate_threshold_fold = scaled_mse.rolling(6).min().max() + self.aggregate_thresholds_per_fold_[f"fold-{i}"] = aggregate_threshold_fold # Accumulate the rolling mins of diffs into common df tag_thresholds_fold = self._feature_fold_thresholds(y_true, y_pred, fold=i) self.feature_thresholds_per_fold_ = self.feature_thresholds_per_fold_.append( tag_thresholds_fold ) - # Calculate the final thresholds per feature based on the previous fold calculations - self.feature_thresholds_ = self._final_thresholds( - thresholds=self.feature_thresholds_per_fold_ - ) + # Final thresholds are the thresholds from the last cv split/fold + self.feature_thresholds_ = tag_thresholds_fold - # For the aggregate, use the accumulated mse of scaled residuals per timestep - self.aggregate_threshold_ = scaled_mse_per_timestep.rolling(6).min().max() + # For the aggregate also use the thresholds from the last split/fold + self.aggregate_threshold_ = aggregate_threshold_fold return cv_output @staticmethod @@ -211,26 +213,6 @@ def _feature_fold_thresholds( diff.name = f"fold-{fold}" return diff - @staticmethod - def _final_thresholds(thresholds: pd.DataFrame) -> pd.Series: - """ - Calculate the aggregate and final thresholds from previously - calculated fold thresholds. - - Parameters - ---------- - thresholds: pd.DataFrame - Aggregate thresholds from previous folds. - - Returns - ------- - pd.Series - Per feature calculated final thresholds over the fold thresholds - """ - final_thresholds = thresholds.mean() - final_thresholds.name = "thresholds" - return final_thresholds - def anomaly( self, X: pd.DataFrame, y: pd.DataFrame, frequency: Optional[timedelta] = None ) -> pd.DataFrame: diff --git a/tests/gordo/machine/model/anomaly/test_anomaly_detectors.py b/tests/gordo/machine/model/anomaly/test_anomaly_detectors.py index d92a103fd..edeb3dd8e 100644 --- a/tests/gordo/machine/model/anomaly/test_anomaly_detectors.py +++ b/tests/gordo/machine/model/anomaly/test_anomaly_detectors.py @@ -161,6 +161,7 @@ def test_diff_detector_threshold(n_features_y: int, n_features_x: int): assert not hasattr(model, "feature_thresholds_") assert not hasattr(model, "aggregate_threshold_") assert not hasattr(model, "feature_thresholds_per_fold_") + assert not hasattr(model, "aggregate_thresholds_per_fold_") model.fit(X, y) @@ -168,6 +169,7 @@ def test_diff_detector_threshold(n_features_y: int, n_features_x: int): assert not hasattr(model, "feature_thresholds_") assert not hasattr(model, "aggregate_threshold_") assert not hasattr(model, "feature_thresholds_per_fold_") + assert not hasattr(model, "aggregate_thresholds_per_fold_") # Calling cross validate should set the threshold for it. model.cross_validate(X=X, y=y) @@ -176,10 +178,12 @@ def test_diff_detector_threshold(n_features_y: int, n_features_x: int): assert hasattr(model, "feature_thresholds_") assert hasattr(model, "aggregate_threshold_") assert hasattr(model, "feature_thresholds_per_fold_") + assert hasattr(model, "aggregate_thresholds_per_fold_") assert isinstance(model.feature_thresholds_, pd.Series) assert len(model.feature_thresholds_) == y.shape[1] assert all(model.feature_thresholds_.notna()) assert isinstance(model.feature_thresholds_per_fold_, pd.DataFrame) + assert isinstance(model.aggregate_thresholds_per_fold_, dict) @pytest.mark.parametrize("return_estimator", (True, False)) @@ -226,21 +230,6 @@ def test_diff_detector_fold_thresholds(y_pred_shape: tuple, y_true_shape: tuple) assert output.name == "fold-1" -def test_diff_detector_final_thresholds(): - """ - Final thresholds is simply calculated as the mean of - previously calculated fold thresholds - """ - thresholds = pd.DataFrame(np.random.random((4, 2))) # 4 folds and 2 features - - expected = thresholds.mean() - output = DiffBasedAnomalyDetector._final_thresholds(thresholds=thresholds) - assert isinstance(output, pd.Series) - assert len(output) == 2 # equal to number of features - assert np.allclose(expected.values, output.values) - assert output.name == "thresholds" - - @pytest.mark.parametrize("require_threshold", (True, False)) def test_diff_detector_require_thresholds(require_threshold: bool): """