From 5ddf5533aebaedbceee53d53ef8fd9ca8b7d70ee Mon Sep 17 00:00:00 2001
From: Apoorva Lal
Date: Tue, 6 Aug 2024 10:33:07 -0700
Subject: [PATCH] Revert "Move fixed propensity model to utils. (#72)"

This reverts commit 3b2777c101eb382593164e2db8401c3acca75fd8.
---
 CHANGELOG.rst                          |  6 +--
 docs/examples/example_propensity.ipynb | 66 ++++++++++++++++----------
 metalearners/utils.py                  | 32 -------------
 tests/test_utils.py                    | 61 +-----------------------
 4 files changed, 45 insertions(+), 120 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index a89a0839..c5b1a50e 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -12,9 +12,7 @@ Changelog

 **New features**

-* Add :meth:`metalearners.metalearner.MetaLearner.init_params`.
-
-* Add :class:`metalearners.utils.FixedBinaryPropensity`.
+* Added :meth:`metalearners.metalearner.MetaLearner.init_params`.

 * Added :meth:`metalearners.metalearner.DRLearner.treatment_effect` to compute AIPW point estimate and standard error
   for _average treatment effects (ATE)_ without requiring a full model fit (which is required for CATE estimation). A new notebook contains examples.
@@ -23,7 +21,7 @@ Changelog

 **New features**

-* Added :meth:`metalearners.metalearner.MetaLearner.fit_all_nuisance` and
+* Added :meth:`metalearners.metalearner.MetaLearner.fit_all_nuisance` and
   :meth:`metalearners.metalearner.MetaLearner.fit_all_treatment`.

 * Add optional ``store_raw_results`` and ``store_results`` parameters to :class:`metalearners.grid_search.MetaLearnerGridSearch`.
diff --git a/docs/examples/example_propensity.ipynb b/docs/examples/example_propensity.ipynb
index 7fc280f8..3a356351 100644
--- a/docs/examples/example_propensity.ipynb
+++ b/docs/examples/example_propensity.ipynb
@@ -59,8 +59,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Using a dummy estimator\n",
-    "-----------------------\n",
+    "Creating our own estimator\n",
+    "--------------------------\n",
     "\n",
     "In this tutorial we will assume that we know that all observations were assigned to the\n",
     "treatment with a fixed probability of 0.3, which is close to the fraction of the observations\n",
@@ -89,10 +89,43 @@
     "dataset, we just use it for illustrational purposes.\n",
     "```\n",
     "\n",
-    "Now we can use a custom ``sklearn``-like classifier: {class}`~metalearners.utils.FixedBinaryPropensity`.\n",
-    "The latter can be used like any ``sklearn`` classifier but will always return the same propensity,\n",
-    "independently of the observed covariates. This propensity has to be provided at initialization via the\n",
-    "``propensity_score`` parameter."
+    "Now we can define our custom ``sklearn``-like classifier. We recommend inheriting from\n",
+    "the ``sklearn`` base classes and following the rules explained in the\n",
+    "[sklearn documentation](https://scikit-learn.org/stable/developers/develop.html); this\n",
+    "avoids having to define helper functions and ensures that the classifier works correctly\n",
+    "with the ``metalearners`` library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.base import BaseEstimator, ClassifierMixin\n",
+    "from typing import Any\n",
+    "from typing_extensions import Self\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "\n",
+    "class FixedPropensityModel(ClassifierMixin, BaseEstimator):\n",
+    "    def __init__(self, propensity_score: float) -> None:\n",
+    "        self.propensity_score = propensity_score\n",
+    "\n",
+    "    def fit(self, X: pd.DataFrame, y: pd.Series) -> Self:\n",
+    "        self.classes_ = np.unique(y.to_numpy())  # sklearn requires this\n",
+    "        return self\n",
+    "\n",
+    "    def predict(self, X: pd.DataFrame) -> np.ndarray[Any, Any]:\n",
+    "        return np.argmax(self.predict_proba(X), axis=1)\n",
+    "\n",
+    "    def predict_proba(self, X: pd.DataFrame) -> np.ndarray[Any, Any]:\n",
+    "        return np.full((len(X), 2), [1 - self.propensity_score, self.propensity_score])"
    ]
   },
   {
@@ -116,12 +149,11 @@
    "outputs": [],
    "source": [
     "from metalearners import RLearner\n",
-    "from metalearners.utils import FixedBinaryPropensity\n",
     "from lightgbm import LGBMRegressor\n",
     "\n",
     "rlearner = RLearner(\n",
     "    nuisance_model_factory=LGBMRegressor,\n",
-    "    propensity_model_factory=FixedBinaryPropensity,\n",
+    "    propensity_model_factory=FixedPropensityModel,\n",
     "    treatment_model_factory=LGBMRegressor,\n",
     "    nuisance_model_params={\"verbose\": -1},\n",
     "    propensity_model_params={\"propensity_score\": 0.3},\n",
@@ -173,24 +205,10 @@
    }
   ],
  "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
   "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "name": "python"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 2
 }
diff --git a/metalearners/utils.py b/metalearners/utils.py
index 587bef67..765e6c1d 100644
--- a/metalearners/utils.py
+++ b/metalearners/utils.py
@@ -1,14 +1,8 @@
 # Copyright (c) QuantCo 2024-2024
 # SPDX-License-Identifier: BSD-3-Clause

-from typing import Any
-
 import numpy as np
-import pandas as pd
-from sklearn.base import BaseEstimator, ClassifierMixin
-from typing_extensions import Self

-from metalearners._typing import Matrix, Vector
 from metalearners.drlearner import DRLearner
 from metalearners.metalearner import MetaLearner
 from metalearners.rlearner import RLearner
@@ -79,29 +73,3 @@ def simplify_output(tensor: np.ndarray) -> np.ndarray:
     if n_outputs == 2:
         return tensor[:, :, 1].reshape(n_obs, n_variants)
     return tensor
-
-
-class FixedBinaryPropensity(ClassifierMixin, BaseEstimator):
-    """Binary classifier propensity dummy model which outputs a fixed propensity,
-    independently of covariates."""
-
-    def __init__(self, propensity_score: float) -> None:
-        if not 0 <= propensity_score <= 1:
-            raise ValueError(
-                f"Expected a propensity score between 0 and 1 but got {propensity_score}."
-            )
-        self.propensity_score = propensity_score
-
-    def fit(self, X: Matrix, y: Vector) -> Self:
-        self.classes_ = np.unique(y)  # sklearn requires this
-        if (n_classes := len(self.classes_)) > 2:
-            raise ValueError(
-                f"FixedBinaryPropensityModel only supports binary outcomes but {n_classes} were provided ."
-            )
-        return self
-
-    def predict(self, X: Matrix) -> np.ndarray[Any, Any]:
-        return np.argmax(self.predict_proba(X), axis=1)
-
-    def predict_proba(self, X: pd.DataFrame) -> np.ndarray[Any, Any]:
-        return np.full((len(X), 2), [1 - self.propensity_score, self.propensity_score])
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d88b5d14..c3f334ec 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,16 +2,11 @@
 # SPDX-License-Identifier: BSD-3-Clause

 import numpy as np
-import pandas as pd
 import pytest
 from lightgbm import LGBMRegressor

 from metalearners.metalearner import MetaLearner
-from metalearners.utils import (
-    FixedBinaryPropensity,
-    metalearner_factory,
-    simplify_output,
-)
+from metalearners.utils import metalearner_factory, simplify_output


 @pytest.mark.parametrize("prefix", ["T"])
@@ -57,57 +52,3 @@ def test_simplify_output(input, expected):
 def test_simplify_output_raises(input):
     with pytest.raises(ValueError, match="needs to be 3-dimensional"):
         simplify_output(input)
-
-
-@pytest.mark.parametrize("use_pd", [True, False])
-def test_fixed_binary_propensity(use_pd):
-    propensity_score = 0.3
-    dominant_class = propensity_score >= 0.5
-
-    model = FixedBinaryPropensity(propensity_score=propensity_score)
-
-    n_samples = 5
-    X_train = np.ones((n_samples, 5))
-    y_train = np.ones(n_samples)
-    if use_pd:
-        X_train = pd.DataFrame(X_train)
-        y_train = pd.Series(y_train)
-
-    model.fit(X_train, y_train)
-
-    n_test_samples = 3
-    X_test = np.zeros(n_test_samples)
-
-    class_predictions = model.predict(X_test)
-    assert np.array_equal(
-        class_predictions, np.array(np.ones(n_test_samples) * dominant_class)
-    )
-
-    probability_estimates = model.predict_proba(X_test)
-    assert np.array_equal(
-        probability_estimates,
-        np.column_stack(
-            (
-                np.ones(n_test_samples) * (1 - propensity_score),
-                np.ones(n_test_samples) * propensity_score,
-            )
-        ),
-    )
-
-
-@pytest.mark.parametrize("propensity_score", [-1, 100, 1.1])
-def test_fixed_binary_propensity_not_a_propbability(propensity_score):
-    with pytest.raises(ValueError, match="between 0 and 1 but got"):
-        FixedBinaryPropensity(propensity_score=propensity_score)
-
-
-def test_fixed_binary_propensity_non_binary():
-    propensity_score = 0.3
-
-    model = FixedBinaryPropensity(propensity_score=propensity_score)
-
-    n_samples = 5
-    X_train = np.ones((n_samples, 5))
-    y_train = np.fromiter(range(n_samples), dtype=int)
-    with pytest.raises(ValueError, match="only supports binary outcomes"):
-        model.fit(X_train, y_train)
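
Below is a minimal, self-contained sketch of the workflow this revert restores to the notebook. The FixedPropensityModel class and the RLearner wiring are copied from the diff above; the synthetic data and the is_classification, n_variants, fit(X, y, w) and predict(X, is_oos=...) details are assumptions about notebook code not shown in this patch and should be checked against the metalearners documentation.

# Sketch only, not part of the patch: FixedPropensityModel is copied from the
# notebook cell added back by this revert; lines marked "assumed" are not in
# the diff.
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, ClassifierMixin
from typing_extensions import Self

from metalearners import RLearner


class FixedPropensityModel(ClassifierMixin, BaseEstimator):
    """sklearn-like classifier returning the same propensity for every row."""

    def __init__(self, propensity_score: float) -> None:
        self.propensity_score = propensity_score

    def fit(self, X: pd.DataFrame, y: pd.Series) -> Self:
        self.classes_ = np.unique(y.to_numpy())  # sklearn requires this
        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        # [P(w=0), P(w=1)] for every row, independent of the covariates X.
        return np.full((len(X), 2), [1 - self.propensity_score, self.propensity_score])


# Assumed synthetic data: treatment assigned with the fixed probability 0.3
# that the notebook postulates, outcome with a constant treatment effect of 0.5.
rng = np.random.default_rng(42)
n = 1_000
X = pd.DataFrame(rng.normal(size=(n, 3)), columns=["x0", "x1", "x2"])
w = pd.Series(rng.binomial(1, 0.3, size=n))
y = X["x0"] + 0.5 * w + rng.normal(size=n)

rlearner = RLearner(
    nuisance_model_factory=LGBMRegressor,
    propensity_model_factory=FixedPropensityModel,
    treatment_model_factory=LGBMRegressor,
    nuisance_model_params={"verbose": -1},
    propensity_model_params={"propensity_score": 0.3},
    is_classification=False,  # assumed: regression outcome, as in the notebook
    n_variants=2,  # assumed: binary treatment
)
rlearner.fit(X, y, w)  # assumed fit signature
cate_estimates = rlearner.predict(X, is_oos=False)  # assumed predict signature

Since predict_proba ignores X, the R-Learner residualizes the treatment against the constant 0.3 propensity rather than an estimated one, which is exactly what the notebook sets out to demonstrate.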