diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 53378c20..7f30b505 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -21,7 +21,9 @@ Changelog
 
 **New features**
 
-* Added :meth:`metalearners.metalearner.MetaLearner.init_params`.
+* Add :meth:`metalearners.metalearner.MetaLearner.init_params`.
+
+* Add :class:`metalearners.utils.FixedBinaryPropensity`.
 
 
 0.8.0 (2024-07-22)
@@ -29,7 +31,7 @@ Changelog
 
 **New features**
 
-* Added :meth:`metalearners.metalearner.MetaLearner.fit_all_nuisance` and
+* Add :meth:`metalearners.metalearner.MetaLearner.fit_all_nuisance` and
   :meth:`metalearners.metalearner.MetaLearner.fit_all_treatment`.
 * Add optional ``store_raw_results`` and ``store_results`` parameters to
   :class:`metalearners.grid_search.MetaLearnerGridSearch`.
diff --git a/docs/examples/example_propensity.ipynb b/docs/examples/example_propensity.ipynb
index 3a356351..7fc280f8 100644
--- a/docs/examples/example_propensity.ipynb
+++ b/docs/examples/example_propensity.ipynb
@@ -59,8 +59,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Creating our own estimator\n",
-    "--------------------------\n",
+    "Using a dummy estimator\n",
+    "-----------------------\n",
     "\n",
     "In this tutorial we will assume that we know that all observations were assigned to the\n",
    "treatment with a fixed probability of 0.3, which is close to the fraction of the observations\n",
@@ -89,43 +89,10 @@
    "dataset, we just use it for illustrational purposes.\n",
    "```\n",
    "\n",
-    "Now we can define our custom ``sklearn``-like classifier. We recommend inheriting from\n",
-    "the ``sklearn`` base classes and following the rules explained in the\n",
-    "[sklearn documentation](https://scikit-learn.org/stable/developers/develop.html) to avoid\n",
-    "having to define helper functions and ensure the correct functionality of the ``metalearners``\n",
-    "library."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "from sklearn.base import BaseEstimator, ClassifierMixin\n",
-    "from typing import Any\n",
-    "from typing_extensions import Self\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "\n",
-    "\n",
-    "class FixedPropensityModel(ClassifierMixin, BaseEstimator):\n",
-    "    def __init__(self, propensity_score: float) -> None:\n",
-    "        self.propensity_score = propensity_score\n",
-    "\n",
-    "    def fit(self, X: pd.DataFrame, y: pd.Series) -> Self:\n",
-    "        self.classes_ = np.unique(y.to_numpy())  # sklearn requires this\n",
-    "        return self\n",
-    "\n",
-    "    def predict(self, X: pd.DataFrame) -> np.ndarray[Any, Any]:\n",
-    "        return np.argmax(self.predict_proba(X), axis=1)\n",
-    "\n",
-    "    def predict_proba(self, X: pd.DataFrame) -> np.ndarray[Any, Any]:\n",
-    "        return np.full((len(X), 2), [1 - self.propensity_score, self.propensity_score])"
+    "Now we can use a ready-made ``sklearn``-like classifier: {class}`~metalearners.utils.FixedBinaryPropensity`.\n",
+    "It can be used like any ``sklearn`` classifier but will always return the same propensity,\n",
+    "independently of the observed covariates. This propensity has to be provided at initialization via the\n",
+    "``propensity_score`` parameter."
   ]
  },
  {
@@ -149,11 +116,12 @@
    "outputs": [],
    "source": [
     "from metalearners import RLearner\n",
+    "from metalearners.utils import FixedBinaryPropensity\n",
     "from lightgbm import LGBMRegressor\n",
     "\n",
     "rlearner = RLearner(\n",
     "    nuisance_model_factory=LGBMRegressor,\n",
-    "    propensity_model_factory=FixedPropensityModel,\n",
+    "    propensity_model_factory=FixedBinaryPropensity,\n",
     "    treatment_model_factory=LGBMRegressor,\n",
     "    nuisance_model_params={\"verbose\": -1},\n",
     "    propensity_model_params={\"propensity_score\": 0.3},\n",
@@ -205,10 +173,24 @@
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/metalearners/utils.py b/metalearners/utils.py
index 765e6c1d..587bef67 100644
--- a/metalearners/utils.py
+++ b/metalearners/utils.py
@@ -1,8 +1,13 @@
 # Copyright (c) QuantCo 2024-2024
 # SPDX-License-Identifier: BSD-3-Clause
 
+from typing import Any
+
 import numpy as np
+from sklearn.base import BaseEstimator, ClassifierMixin
+from typing_extensions import Self
 
+from metalearners._typing import Matrix, Vector
 from metalearners.drlearner import DRLearner
 from metalearners.metalearner import MetaLearner
 from metalearners.rlearner import RLearner
@@ -73,3 +78,29 @@ def simplify_output(tensor: np.ndarray) -> np.ndarray:
     if n_outputs == 2:
         return tensor[:, :, 1].reshape(n_obs, n_variants)
     return tensor
+
+
+class FixedBinaryPropensity(ClassifierMixin, BaseEstimator):
+    """Dummy binary classifier which always predicts a fixed propensity score,
+    independently of the covariates."""
+
+    def __init__(self, propensity_score: float) -> None:
+        if not 0 <= propensity_score <= 1:
+            raise ValueError(
+                f"Expected a propensity score between 0 and 1 but got {propensity_score}."
+            )
+        self.propensity_score = propensity_score
+
+    def fit(self, X: Matrix, y: Vector) -> Self:
+        self.classes_ = np.unique(y)  # sklearn requires this
+        if (n_classes := len(self.classes_)) > 2:
+            raise ValueError(
+                f"FixedBinaryPropensity only supports binary outcomes but {n_classes} classes were provided."
+            )
+        return self
+
+    def predict(self, X: Matrix) -> np.ndarray[Any, Any]:
+        return np.argmax(self.predict_proba(X), axis=1)
+
+    def predict_proba(self, X: Matrix) -> np.ndarray[Any, Any]:
+        return np.full((len(X), 2), [1 - self.propensity_score, self.propensity_score])
diff --git a/tests/test_utils.py b/tests/test_utils.py
index c3f334ec..d88b5d14 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,11 +2,16 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import numpy as np
+import pandas as pd
 import pytest
 from lightgbm import LGBMRegressor
 
 from metalearners.metalearner import MetaLearner
-from metalearners.utils import metalearner_factory, simplify_output
+from metalearners.utils import (
+    FixedBinaryPropensity,
+    metalearner_factory,
+    simplify_output,
+)
 
 
 @pytest.mark.parametrize("prefix", ["T"])
@@ -52,3 +57,57 @@ def test_simplify_output(input, expected):
 def test_simplify_output_raises(input):
     with pytest.raises(ValueError, match="needs to be 3-dimensional"):
         simplify_output(input)
+
+
+@pytest.mark.parametrize("use_pd", [True, False])
+def test_fixed_binary_propensity(use_pd):
+    propensity_score = 0.3
+    dominant_class = propensity_score >= 0.5
+
+    model = FixedBinaryPropensity(propensity_score=propensity_score)
+
+    n_samples = 5
+    X_train = np.ones((n_samples, 5))
+    y_train = np.ones(n_samples)
+    if use_pd:
+        X_train = pd.DataFrame(X_train)
+        y_train = pd.Series(y_train)
+
+    model.fit(X_train, y_train)
+
+    n_test_samples = 3
+    X_test = np.zeros((n_test_samples, 5))
+
+    class_predictions = model.predict(X_test)
+    assert np.array_equal(
+        class_predictions, np.array(np.ones(n_test_samples) * dominant_class)
+    )
+
+    probability_estimates = model.predict_proba(X_test)
+    assert np.array_equal(
+        probability_estimates,
+        np.column_stack(
+            (
+                np.ones(n_test_samples) * (1 - propensity_score),
+                np.ones(n_test_samples) * propensity_score,
+            )
+        ),
+    )
+
+
+@pytest.mark.parametrize("propensity_score", [-1, 100, 1.1])
+def test_fixed_binary_propensity_not_a_probability(propensity_score):
+    with pytest.raises(ValueError, match="between 0 and 1 but got"):
+        FixedBinaryPropensity(propensity_score=propensity_score)
+
+
+def test_fixed_binary_propensity_non_binary():
+    propensity_score = 0.3
+
+    model = FixedBinaryPropensity(propensity_score=propensity_score)
+
+    n_samples = 5
+    X_train = np.ones((n_samples, 5))
+    y_train = np.fromiter(range(n_samples), dtype=int)
+    with pytest.raises(ValueError, match="only supports binary outcomes"):
+        model.fit(X_train, y_train)
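For reference, a minimal usage sketch of the new estimator, mirroring what the added tests exercise. The import path and the ``propensity_score`` parameter come from the patch above; the toy ``X`` and ``y`` arrays are illustrative assumptions:

```python
import numpy as np

from metalearners.utils import FixedBinaryPropensity

# Covariates are ignored by the model; only the number of rows matters.
X = np.zeros((4, 3))
# Outcomes must be binary: more than two distinct classes make fit() raise.
y = np.array([0, 1, 0, 1])

model = FixedBinaryPropensity(propensity_score=0.3).fit(X, y)

# Every row is [1 - propensity_score, propensity_score], i.e. [0.7, 0.3].
print(model.predict_proba(X))
# predict() takes the argmax of the fixed probabilities, so all zeros here.
print(model.predict(X))
```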