Update threshold calculation for DiffBasedAnomalyDetector (#924)

* Added the aggregate threshold per fold and updated the final thresholds
* Fixed a formatting issue and updated the metadata variable name
* Applied the forgotten variable name update in get_metadata
equinor · Feb 6, 2020 · f664b76 · f664b76
1 parent 1fd4e0b
commit f664b76
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 45 deletions.
diff --git a/gordo/machine/model/anomaly/diff.py b/gordo/machine/model/anomaly/diff.py
@@ -72,7 +72,10 @@ def get_metadata(self):
             metadata[
                 "feature-thresholds-per-fold"
             ] = self.feature_thresholds_per_fold_.to_dict()
-
+        if hasattr(self, "aggregate_thresholds_per_fold_"):
+            metadata[
+                "aggregate-thresholds-per-fold"
+            ] = self.aggregate_thresholds_per_fold_
         if isinstance(self.base_estimator, GordoBase):
             metadata.update(self.base_estimator.get_metadata())
         else:
@@ -128,7 +131,7 @@ def cross_validate(
         cv_output = cross_validate(self, X=X, y=y, **kwargs)
 
         self.feature_thresholds_per_fold_ = pd.DataFrame()
-        scaled_mse_per_timestep = pd.Series()
+        self.aggregate_thresholds_per_fold_ = {}
 
         for i, ((test_idxs, _train_idxs), split_model) in enumerate(
             zip(kwargs["cv"].split(X, y), cv_output["estimator"])
@@ -143,21 +146,20 @@ def cross_validate(
 
             # Model's timestep scaled mse
             scaled_mse = self._scaled_mse_per_timestep(split_model, y_true, y_pred)
-            scaled_mse_per_timestep = pd.concat((scaled_mse_per_timestep, scaled_mse))
-
+            # For the aggregate threshold for the fold model, use the mse of scaled residuals per timestep
+            aggregate_threshold_fold = scaled_mse.rolling(6).min().max()
+            self.aggregate_thresholds_per_fold_[f"fold-{i}"] = aggregate_threshold_fold
             # Accumulate the rolling mins of diffs into common df
             tag_thresholds_fold = self._feature_fold_thresholds(y_true, y_pred, fold=i)
             self.feature_thresholds_per_fold_ = self.feature_thresholds_per_fold_.append(
                 tag_thresholds_fold
             )
 
-        # Calculate the final thresholds per feature based on the previous fold calculations
-        self.feature_thresholds_ = self._final_thresholds(
-            thresholds=self.feature_thresholds_per_fold_
-        )
+        # Final thresholds are the thresholds from the last cv split/fold
+        self.feature_thresholds_ = tag_thresholds_fold
 
-        # For the aggregate, use the accumulated mse of scaled residuals per timestep
-        self.aggregate_threshold_ = scaled_mse_per_timestep.rolling(6).min().max()
+        # For the aggregate also use the thresholds from the last split/fold
+        self.aggregate_threshold_ = aggregate_threshold_fold
         return cv_output
 
     @staticmethod
@@ -211,26 +213,6 @@ def _feature_fold_thresholds(
         diff.name = f"fold-{fold}"
         return diff
 
-    @staticmethod
-    def _final_thresholds(thresholds: pd.DataFrame) -> pd.Series:
-        """
-        Calculate the aggregate and final thresholds from previously
-        calculated fold thresholds.
-
-        Parameters
-        ----------
-        thresholds: pd.DataFrame
-            Aggregate thresholds from previous folds.
-
-        Returns
-        -------
-        pd.Series
-            Per feature calculated final thresholds over the fold thresholds
-        """
-        final_thresholds = thresholds.mean()
-        final_thresholds.name = "thresholds"
-        return final_thresholds
-
     def anomaly(
         self, X: pd.DataFrame, y: pd.DataFrame, frequency: Optional[timedelta] = None
     ) -> pd.DataFrame:

diff --git a/tests/gordo/machine/model/anomaly/test_anomaly_detectors.py b/tests/gordo/machine/model/anomaly/test_anomaly_detectors.py
@@ -161,13 +161,15 @@ def test_diff_detector_threshold(n_features_y: int, n_features_x: int):
     assert not hasattr(model, "feature_thresholds_")
     assert not hasattr(model, "aggregate_threshold_")
     assert not hasattr(model, "feature_thresholds_per_fold_")
+    assert not hasattr(model, "aggregate_thresholds_per_fold_")
 
     model.fit(X, y)
 
     # Until it has done cross validation, it has no threshold.
     assert not hasattr(model, "feature_thresholds_")
     assert not hasattr(model, "aggregate_threshold_")
     assert not hasattr(model, "feature_thresholds_per_fold_")
+    assert not hasattr(model, "aggregate_thresholds_per_fold_")
 
     # Calling cross validate should set the threshold for it.
     model.cross_validate(X=X, y=y)
@@ -176,10 +178,12 @@ def test_diff_detector_threshold(n_features_y: int, n_features_x: int):
     assert hasattr(model, "feature_thresholds_")
     assert hasattr(model, "aggregate_threshold_")
     assert hasattr(model, "feature_thresholds_per_fold_")
+    assert hasattr(model, "aggregate_thresholds_per_fold_")
     assert isinstance(model.feature_thresholds_, pd.Series)
     assert len(model.feature_thresholds_) == y.shape[1]
     assert all(model.feature_thresholds_.notna())
     assert isinstance(model.feature_thresholds_per_fold_, pd.DataFrame)
+    assert isinstance(model.aggregate_thresholds_per_fold_, dict)
 
 
 @pytest.mark.parametrize("return_estimator", (True, False))
@@ -226,21 +230,6 @@ def test_diff_detector_fold_thresholds(y_pred_shape: tuple, y_true_shape: tuple)
     assert output.name == "fold-1"
 
 
-def test_diff_detector_final_thresholds():
-    """
-    Final thresholds is simply calculated as the mean of
-    previously calculated fold thresholds
-    """
-    thresholds = pd.DataFrame(np.random.random((4, 2)))  # 4 folds and 2 features
-
-    expected = thresholds.mean()
-    output = DiffBasedAnomalyDetector._final_thresholds(thresholds=thresholds)
-    assert isinstance(output, pd.Series)
-    assert len(output) == 2  # equal to number of features
-    assert np.allclose(expected.values, output.values)
-    assert output.name == "thresholds"
-
-
 @pytest.mark.parametrize("require_threshold", (True, False))
 def test_diff_detector_require_thresholds(require_threshold: bool):
     """