Skip to content

Commit

Permalink
Update threshold calculation for DiffBasedAnomalyDetector (#924)
Browse files Browse the repository at this point in the history
* Added the aggregate threshold per fold and updated final thresholds

* Fixed a formatting issue and updated the metadata variable name

* Applied the previously forgotten variable name update in get_metadata
  • Loading branch information
Choukha Ram authored Feb 6, 2020
1 parent 1fd4e0b commit f664b76
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 45 deletions.
42 changes: 12 additions & 30 deletions gordo/machine/model/anomaly/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,10 @@ def get_metadata(self):
metadata[
"feature-thresholds-per-fold"
] = self.feature_thresholds_per_fold_.to_dict()

if hasattr(self, "aggregate_thresholds_per_fold_"):
metadata[
"aggregate-thresholds-per-fold"
] = self.aggregate_thresholds_per_fold_
if isinstance(self.base_estimator, GordoBase):
metadata.update(self.base_estimator.get_metadata())
else:
Expand Down Expand Up @@ -128,7 +131,7 @@ def cross_validate(
cv_output = cross_validate(self, X=X, y=y, **kwargs)

self.feature_thresholds_per_fold_ = pd.DataFrame()
scaled_mse_per_timestep = pd.Series()
self.aggregate_thresholds_per_fold_ = {}

for i, ((test_idxs, _train_idxs), split_model) in enumerate(
zip(kwargs["cv"].split(X, y), cv_output["estimator"])
Expand All @@ -143,21 +146,20 @@ def cross_validate(

# Model's timestep scaled mse
scaled_mse = self._scaled_mse_per_timestep(split_model, y_true, y_pred)
scaled_mse_per_timestep = pd.concat((scaled_mse_per_timestep, scaled_mse))

# For the aggregate threshold for the fold model, use the mse of scaled residuals per timestep
aggregate_threshold_fold = scaled_mse.rolling(6).min().max()
self.aggregate_thresholds_per_fold_[f"fold-{i}"] = aggregate_threshold_fold
# Accumulate the rolling mins of diffs into common df
tag_thresholds_fold = self._feature_fold_thresholds(y_true, y_pred, fold=i)
self.feature_thresholds_per_fold_ = self.feature_thresholds_per_fold_.append(
tag_thresholds_fold
)

# Calculate the final thresholds per feature based on the previous fold calculations
self.feature_thresholds_ = self._final_thresholds(
thresholds=self.feature_thresholds_per_fold_
)
# Final thresholds are the thresholds from the last cv split/fold
self.feature_thresholds_ = tag_thresholds_fold

# For the aggregate, use the accumulated mse of scaled residuals per timestep
self.aggregate_threshold_ = scaled_mse_per_timestep.rolling(6).min().max()
# For the aggregate also use the thresholds from the last split/fold
self.aggregate_threshold_ = aggregate_threshold_fold
return cv_output

@staticmethod
Expand Down Expand Up @@ -211,26 +213,6 @@ def _feature_fold_thresholds(
diff.name = f"fold-{fold}"
return diff

@staticmethod
def _final_thresholds(thresholds: pd.DataFrame) -> pd.Series:
    """
    Calculate the aggregate and final thresholds from previously
    calculated fold thresholds.

    Parameters
    ----------
    thresholds: pd.DataFrame
        Aggregate thresholds from previous folds. The mean is taken
        column-wise, so each column yields one final value.

    Returns
    -------
    pd.Series
        Per feature calculated final thresholds over the fold thresholds,
        named ``"thresholds"``.
    """
    # DataFrame.mean() reduces over rows by default: one value per column.
    final_thresholds = thresholds.mean()
    final_thresholds.name = "thresholds"
    return final_thresholds

def anomaly(
self, X: pd.DataFrame, y: pd.DataFrame, frequency: Optional[timedelta] = None
) -> pd.DataFrame:
Expand Down
19 changes: 4 additions & 15 deletions tests/gordo/machine/model/anomaly/test_anomaly_detectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,15 @@ def test_diff_detector_threshold(n_features_y: int, n_features_x: int):
assert not hasattr(model, "feature_thresholds_")
assert not hasattr(model, "aggregate_threshold_")
assert not hasattr(model, "feature_thresholds_per_fold_")
assert not hasattr(model, "aggregate_thresholds_per_fold_")

model.fit(X, y)

# Until it has done cross validation, it has no threshold.
assert not hasattr(model, "feature_thresholds_")
assert not hasattr(model, "aggregate_threshold_")
assert not hasattr(model, "feature_thresholds_per_fold_")
assert not hasattr(model, "aggregate_thresholds_per_fold_")

# Calling cross validate should set the threshold for it.
model.cross_validate(X=X, y=y)
Expand All @@ -176,10 +178,12 @@ def test_diff_detector_threshold(n_features_y: int, n_features_x: int):
assert hasattr(model, "feature_thresholds_")
assert hasattr(model, "aggregate_threshold_")
assert hasattr(model, "feature_thresholds_per_fold_")
assert hasattr(model, "aggregate_thresholds_per_fold_")
assert isinstance(model.feature_thresholds_, pd.Series)
assert len(model.feature_thresholds_) == y.shape[1]
assert all(model.feature_thresholds_.notna())
assert isinstance(model.feature_thresholds_per_fold_, pd.DataFrame)
assert isinstance(model.aggregate_thresholds_per_fold_, dict)


@pytest.mark.parametrize("return_estimator", (True, False))
Expand Down Expand Up @@ -226,21 +230,6 @@ def test_diff_detector_fold_thresholds(y_pred_shape: tuple, y_true_shape: tuple)
assert output.name == "fold-1"


def test_diff_detector_final_thresholds():
    """
    Final thresholds are simply calculated as the mean of the
    previously calculated fold thresholds.
    """
    thresholds = pd.DataFrame(np.random.random((4, 2)))  # 4 folds and 2 features

    expected = thresholds.mean()
    output = DiffBasedAnomalyDetector._final_thresholds(thresholds=thresholds)
    assert isinstance(output, pd.Series)
    assert len(output) == 2  # equal to number of features
    assert np.allclose(expected.values, output.values)
    assert output.name == "thresholds"


@pytest.mark.parametrize("require_threshold", (True, False))
def test_diff_detector_require_thresholds(require_threshold: bool):
"""
Expand Down

0 comments on commit f664b76

Please sign in to comment.