From 4409cc5f9a2644b75342a40a743bb22a7d128edc Mon Sep 17 00:00:00 2001 From: Kevin Klein <7267523+kklein@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:58:53 +0200 Subject: [PATCH] Fix S-Learner's leakage (#79) * Update benchmark values. * Fix S-Learner's leakage. * Add changelog entry. * Fix date in changelog. --- CHANGELOG.rst | 8 ++++++++ benchmarks/readme.md | 12 ++++++------ metalearners/slearner.py | 35 +++-------------------------------- 3 files changed, 17 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1ddf9b2e..d56f4106 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,14 @@ Changelog ========= +0.9.1 (2024-08-xx) +------------------ + +**Bug fixes** + +* Fix bug in which the :class:`~metalearners.slearner.SLearner`'s + inference step would have some leakage in the in-sample scenario. + 0.9.0 (2024-08-02) ------------------ diff --git a/benchmarks/readme.md b/benchmarks/readme.md index b23a5ee2..e91a14c2 100644 --- a/benchmarks/readme.md +++ b/benchmarks/readme.md @@ -32,12 +32,12 @@ on ground truth CATEs: | S-learner | causalml_in_sample | causalml_oos | econml_in_sample | econml_oos | metalearners_in_sample | metalearners_oos | | :------------------------------------------------------------ | -----------------: | -----------: | ---------------: | ---------: | ---------------------: | ---------------: | -| synthetic_data_continuous_outcome_binary_treatment_linear_te | 14.5706 | 14.6248 | 14.5706 | 14.6248 | 14.5729 | 14.6248 | -| synthetic_data_binary_outcome_binary_treatment_linear_te | 0.229101 | 0.228616 | nan | nan | 0.229231 | 0.2286 | -| twins_pandas | 0.314253 | 0.318554 | nan | nan | 0.371613 | 0.319028 | -| twins_numpy | 0.314253 | 0.318554 | nan | nan | 0.361345 | 0.318554 | -| synthetic_data_continuous_outcome_multi_treatment_linear_te | nan | nan | 14.1468 | 14.185 | 14.1478 | 14.1853 | -| synthetic_data_continuous_outcome_multi_treatment_constant_te | nan | nan | 0.0110779 | 0.0110778 | 0.0104649 | 0.00897915 | +| synthetic_data_continuous_outcome_binary_treatment_linear_te | 14.5706 | 14.6248 | 14.5706 | 14.6248 | 14.5707 | 14.6248 | +| synthetic_data_binary_outcome_binary_treatment_linear_te | 0.229101 | 0.228616 | nan | nan | 0.229201 | 0.2286 | +| twins_pandas | 0.314253 | 0.318554 | nan | nan | 0.322171 | 0.319028 | +| twins_numpy | 0.314253 | 0.318554 | nan | nan | 0.322132 | 0.318554 | +| synthetic_data_continuous_outcome_multi_treatment_linear_te | nan | nan | 14.1468 | 14.185 | 14.147 | 14.1853 | +| synthetic_data_continuous_outcome_multi_treatment_constant_te | nan | nan | 0.0110779 | 0.0110778 | 0.0101122 | 0.00897915 | | X-learner | causalml_in_sample | causalml_oos | econml_in_sample | econml_oos | metalearners_in_sample | metalearners_oos | | :------------------------------------------------------------ | -----------------: | -----------: | ---------------: | ---------: | ---------------------: | ---------------: | diff --git a/metalearners/slearner.py b/metalearners/slearner.py index e636c0f2..dde4c8ea 100644 --- a/metalearners/slearner.py +++ b/metalearners/slearner.py @@ -251,33 +251,8 @@ def predict_conditional_average_outcomes( n_obs = len(X) conditional_average_outcomes_list = [] - # The idea behind using is_oos = True for in sample predictions is the following: - # Assuming observation i has received variant v then the model has been trained - # on row (X_i, v), therefore when predicting the conditional average outcome for - # variant v we have to use cross fitting to avoid prediciting on an identical 
row - # which the model has been trained on. (This happens either with overall, mean - # or median as some of the models would be trained with this row). On the other - # hand, when predicting the conditional average outcome for variant v' != v, - # the model has never seen the row (X_i, v'), so we can use it as it was out of - # sample. - # This can bring some issues where the cross fitted predictions are based on models - # which have been trained with a smaller dataset (K-1 folds) than the overall - # model and this may produce some different distributions in the outputs, for this - # it may make sense to restrict the oos_method to mean or median when is_oos = False, - # although further investigation is needed. - if not is_oos: - X_with_w = _append_treatment_to_covariates( - X, - self._fitted_treatments, - self._supports_categoricals, - self.n_variants, - ) - in_sample_pred = self.predict_nuisance( - X=X_with_w, model_kind=_BASE_MODEL, model_ord=0, is_oos=False - ) - - for v in range(self.n_variants): - w = np.array([v] * n_obs) + for treatment_variant in range(self.n_variants): + w = np.array([treatment_variant] * n_obs) X_with_w = _append_treatment_to_covariates( X, w, self._supports_categoricals, self.n_variants ) @@ -285,13 +260,9 @@ def predict_conditional_average_outcomes( X=X_with_w, model_kind=_BASE_MODEL, model_ord=0, - is_oos=True, + is_oos=is_oos, oos_method=oos_method, ) - if not is_oos: - variant_predictions[self._fitted_treatments == v] = in_sample_pred[ - self._fitted_treatments == v - ] conditional_average_outcomes_list.append(variant_predictions)
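
For readers of this patch, the following is a minimal, hypothetical sketch (not part of the diff) of the in-sample call path that the fix above touches. Only predict_conditional_average_outcomes(X, is_oos=...) appears in the hunk itself; the import path, the constructor arguments (nuisance_model_factory, is_classification, n_variants), fit(X, y, w), and the LGBMRegressor base model are assumptions about the metalearners API made purely for illustration.

    # Illustrative sketch only -- any name not shown in the hunk above is an assumption.
    import numpy as np
    from lightgbm import LGBMRegressor
    from metalearners import SLearner  # assumed import path

    rng = np.random.default_rng(0)
    n_obs = 1_000
    X = rng.normal(size=(n_obs, 5))
    w = rng.integers(0, 2, size=n_obs)                   # binary treatment variant
    y = X[:, 0] + w * X[:, 1] + rng.normal(size=n_obs)   # outcome with a simple treatment effect

    slearner = SLearner(
        nuisance_model_factory=LGBMRegressor,  # assumed constructor parameters
        is_classification=False,
        n_variants=2,
    )
    slearner.fit(X, y, w)  # assumed fit signature

    # In-sample scenario from the changelog entry: with this patch, is_oos is
    # forwarded to the base model's prediction for every treatment variant, so
    # in-sample estimates come from cross-fitted base models instead of being
    # partly produced by a model trained on the very rows being predicted.
    caos = slearner.predict_conditional_average_outcomes(X, is_oos=False)
    print(caos.shape)

The sketch only exercises the public entry point; the behavioral change itself is confined to the hunk above, which replaces the special-cased in-sample handling with a plain forwarding of is_oos to predict_nuisance.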