Merge branch 'main' into onnx
kklein authored Aug 2, 2024
2 parents a8ecb2d + 6d8bc68 commit fced9a6
Showing 4 changed files with 120 additions and 45 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG.rst
@@ -12,7 +12,9 @@ Changelog

 **New features**

-* Added :meth:`metalearners.metalearner.MetaLearner.init_params`.
+* Add :meth:`metalearners.metalearner.MetaLearner.init_params`.
+
+* Add :class:`metalearners.utils.FixedBinaryPropensity`.

 * Added ``_build_onnx`` to :class:`metalearners.MetaLearner` abstract class and implement it
   for :class:`metalearners.TLearner`, :class:`metalearners.XLearner`, :class:`metalearners.RLearner`
@@ -25,7 +27,7 @@ Changelog

 **New features**

-* Added :meth:`metalearners.metalearner.MetaLearner.fit_all_nuisance` and
+* Add :meth:`metalearners.metalearner.MetaLearner.fit_all_nuisance` and
   :meth:`metalearners.metalearner.MetaLearner.fit_all_treatment`.

 * Add optional ``store_raw_results`` and ``store_results`` parameters to :class:`metalearners.grid_search.MetaLearnerGridSearch`.
66 changes: 24 additions & 42 deletions docs/examples/example_propensity.ipynb
@@ -59,8 +59,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Creating our own estimator\n",
-    "--------------------------\n",
+    "Using a dummy estimator\n",
+    "-----------------------\n",
     "\n",
     "In this tutorial we will assume that we know that all observations were assigned to the\n",
     "treatment with a fixed probability of 0.3, which is close to the fraction of the observations\n",
@@ -89,43 +89,10 @@
     "dataset, we just use it for illustrational purposes.\n",
     "```\n",
     "\n",
-    "Now we can define our custom ``sklearn``-like classifier. We recommend inheriting from\n",
-    "the ``sklearn`` base classes and following the rules explained in the\n",
-    "[sklearn documentation](https://scikit-learn.org/stable/developers/develop.html) to avoid\n",
-    "having to define helper functions and ensure the correct functionality of the ``metalearners``\n",
-    "library."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "from sklearn.base import BaseEstimator, ClassifierMixin\n",
-    "from typing import Any\n",
-    "from typing_extensions import Self\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "\n",
-    "\n",
-    "class FixedPropensityModel(ClassifierMixin, BaseEstimator):\n",
-    "    def __init__(self, propensity_score: float) -> None:\n",
-    "        self.propensity_score = propensity_score\n",
-    "\n",
-    "    def fit(self, X: pd.DataFrame, y: pd.Series) -> Self:\n",
-    "        self.classes_ = np.unique(y.to_numpy())  # sklearn requires this\n",
-    "        return self\n",
-    "\n",
-    "    def predict(self, X: pd.DataFrame) -> np.ndarray[Any, Any]:\n",
-    "        return np.argmax(self.predict_proba(X), axis=1)\n",
-    "\n",
-    "    def predict_proba(self, X: pd.DataFrame) -> np.ndarray[Any, Any]:\n",
-    "        return np.full((len(X), 2), [1 - self.propensity_score, self.propensity_score])"
+    "Now we can use a custom ``sklearn``-like classifier: {class}`~metalearners.utils.FixedBinaryPropensity`.\n",
+    "The latter can be used like any ``sklearn`` classifier but will always return the same propensity,\n",
+    "independently of the observed covariates. This propensity has to be provided at initialization via the\n",
+    "``propensity_score`` parameter."
     ]
    },
    {
@@ -149,11 +116,12 @@
    "outputs": [],
    "source": [
     "from metalearners import RLearner\n",
+    "from metalearners.utils import FixedBinaryPropensity\n",
     "from lightgbm import LGBMRegressor\n",
     "\n",
     "rlearner = RLearner(\n",
     "    nuisance_model_factory=LGBMRegressor,\n",
-    "    propensity_model_factory=FixedPropensityModel,\n",
+    "    propensity_model_factory=FixedBinaryPropensity,\n",
     "    treatment_model_factory=LGBMRegressor,\n",
     "    nuisance_model_params={\"verbose\": -1},\n",
     "    propensity_model_params={\"propensity_score\": 0.3},\n",
@@ -205,10 +173,24 @@
    }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
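
For context, here is a minimal end-to-end sketch of how the updated notebook cell might be exercised. Only the constructor arguments above appear in this diff; the synthetic data, the ``is_classification`` and ``n_variants`` arguments, and the ``fit(X, y, w)`` / ``predict(X, is_oos=...)`` signatures are assumptions based on the broader ``metalearners`` API.

# Hedged sketch; everything flagged above is an assumption, not part of this commit.
import numpy as np
from lightgbm import LGBMRegressor

from metalearners import RLearner
from metalearners.utils import FixedBinaryPropensity

rng = np.random.default_rng(42)
n_obs = 1_000
X = rng.normal(size=(n_obs, 3))               # covariates
w = rng.binomial(1, 0.3, size=n_obs)          # treatment assigned with fixed probability 0.3
y = X[:, 0] + 2 * w + rng.normal(size=n_obs)  # outcome with a homogeneous treatment effect of 2

rlearner = RLearner(
    nuisance_model_factory=LGBMRegressor,
    propensity_model_factory=FixedBinaryPropensity,
    treatment_model_factory=LGBMRegressor,
    nuisance_model_params={"verbose": -1},
    propensity_model_params={"propensity_score": 0.3},
    treatment_model_params={"verbose": -1},
    is_classification=False,
    n_variants=2,
)
rlearner.fit(X, y, w)
cate_estimates = rlearner.predict(X, is_oos=False)  # should hover around the true effect of 2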
32 changes: 32 additions & 0 deletions metalearners/utils.py
@@ -1,8 +1,14 @@
 # Copyright (c) QuantCo 2024-2024
 # SPDX-License-Identifier: BSD-3-Clause

+from typing import Any
+
 import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, ClassifierMixin
+from typing_extensions import Self

+from metalearners._typing import Matrix, Vector
 from metalearners.drlearner import DRLearner
 from metalearners.metalearner import MetaLearner
 from metalearners.rlearner import RLearner
@@ -73,3 +79,29 @@ def simplify_output(tensor: np.ndarray) -> np.ndarray:
     if n_outputs == 2:
         return tensor[:, :, 1].reshape(n_obs, n_variants)
     return tensor
+
+
+class FixedBinaryPropensity(ClassifierMixin, BaseEstimator):
+    """Dummy binary classifier which always predicts the same fixed propensity,
+    independently of the covariates."""
+
+    def __init__(self, propensity_score: float) -> None:
+        if not 0 <= propensity_score <= 1:
+            raise ValueError(
+                f"Expected a propensity score between 0 and 1 but got {propensity_score}."
+            )
+        self.propensity_score = propensity_score
+
+    def fit(self, X: Matrix, y: Vector) -> Self:
+        self.classes_ = np.unique(y)  # sklearn requires this
+        if (n_classes := len(self.classes_)) > 2:
+            raise ValueError(
+                f"FixedBinaryPropensity only supports binary outcomes but {n_classes} classes were provided."
+            )
+        return self
+
+    def predict(self, X: Matrix) -> np.ndarray[Any, Any]:
+        return np.argmax(self.predict_proba(X), axis=1)  # class with the larger fixed probability
+
+    def predict_proba(self, X: Matrix) -> np.ndarray[Any, Any]:
+        return np.full((len(X), 2), [1 - self.propensity_score, self.propensity_score])
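
For reference, a quick sketch of the behavior the new class guarantees; the toy arrays are made up for illustration.

import numpy as np

from metalearners.utils import FixedBinaryPropensity

model = FixedBinaryPropensity(propensity_score=0.3)
model.fit(np.ones((4, 2)), np.array([0, 1, 0, 1]))  # y may contain at most two classes

model.predict_proba(np.zeros((2, 2)))  # array([[0.7, 0.3], [0.7, 0.3]]): same row for every observation
model.predict(np.zeros((2, 2)))        # array([0, 0]): class 0 has the larger fixed probability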
61 changes: 60 additions & 1 deletion tests/test_utils.py
@@ -2,11 +2,16 @@
 # SPDX-License-Identifier: BSD-3-Clause

 import numpy as np
+import pandas as pd
 import pytest
 from lightgbm import LGBMRegressor

 from metalearners.metalearner import MetaLearner
-from metalearners.utils import metalearner_factory, simplify_output
+from metalearners.utils import (
+    FixedBinaryPropensity,
+    metalearner_factory,
+    simplify_output,
+)


 @pytest.mark.parametrize("prefix", ["T"])
@@ -52,3 +57,57 @@ def test_simplify_output(input, expected):
 def test_simplify_output_raises(input):
     with pytest.raises(ValueError, match="needs to be 3-dimensional"):
         simplify_output(input)
+
+
+@pytest.mark.parametrize("use_pd", [True, False])
+def test_fixed_binary_propensity(use_pd):
+    propensity_score = 0.3
+    dominant_class = propensity_score >= 0.5
+
+    model = FixedBinaryPropensity(propensity_score=propensity_score)
+
+    n_samples = 5
+    X_train = np.ones((n_samples, 5))
+    y_train = np.ones(n_samples)
+    if use_pd:
+        X_train = pd.DataFrame(X_train)
+        y_train = pd.Series(y_train)
+
+    model.fit(X_train, y_train)
+
+    n_test_samples = 3
+    X_test = np.zeros(n_test_samples)
+
+    class_predictions = model.predict(X_test)
+    assert np.array_equal(
+        class_predictions, np.array(np.ones(n_test_samples) * dominant_class)
+    )
+
+    probability_estimates = model.predict_proba(X_test)
+    assert np.array_equal(
+        probability_estimates,
+        np.column_stack(
+            (
+                np.ones(n_test_samples) * (1 - propensity_score),
+                np.ones(n_test_samples) * propensity_score,
+            )
+        ),
+    )
+
+
+@pytest.mark.parametrize("propensity_score", [-1, 100, 1.1])
+def test_fixed_binary_propensity_not_a_probability(propensity_score):
+    with pytest.raises(ValueError, match="between 0 and 1 but got"):
+        FixedBinaryPropensity(propensity_score=propensity_score)
+
+
+def test_fixed_binary_propensity_non_binary():
+    propensity_score = 0.3
+
+    model = FixedBinaryPropensity(propensity_score=propensity_score)
+
+    n_samples = 5
+    X_train = np.ones((n_samples, 5))
+    y_train = np.fromiter(range(n_samples), dtype=int)
+    with pytest.raises(ValueError, match="only supports binary outcomes"):
+        model.fit(X_train, y_train)
