From c8fdb6a86d76813cd772376c4e011986b35179ae Mon Sep 17 00:00:00 2001
From: Kevin Klein <7267523+kklein@users.noreply.github.com>
Date: Wed, 26 Jun 2024 09:02:54 +0200
Subject: [PATCH 1/4] Add a development page to docs (#28)
* Draft development page.
* Fix formatting.
* Add missing postinstall.
---
docs/development.rst | 38 ++++++++++++++++++++++++++++++++++++++
docs/index.rst | 1 +
2 files changed, 39 insertions(+)
create mode 100644 docs/development.rst
diff --git a/docs/development.rst b/docs/development.rst
new file mode 100644
index 00000000..711520dc
--- /dev/null
+++ b/docs/development.rst
@@ -0,0 +1,38 @@
+Development
+===========
+
+The ``metalearners`` repository can be cloned as follows:
+
+.. code-block:: console
+
+ git clone https://github.com/Quantco/metalearners.git
+
+The dependencies are managed via
+`pixi <https://pixi.sh>`_. Please make sure to install ``pixi`` on
+your system. Once pixi is installed, you can install and run the
+pre-commit hooks as follows:
+
+
+.. code-block:: console
+
+ pixi run pre-commit-install
+ pixi run pre-commit-run
+
+
+You can run the tests as follows:
+
+.. code-block:: console
+
+ pixi run postinstall
+ pixi run pytest tests
+
+You can build the documentation locally by running
+
+.. code-block:: console
+
+ pixi run -e docs postinstall
+ pixi run -e docs docs
+
+You can then inspect the locally built docs by opening ``docs/_build/index.html``.
+
+You can find all pixi tasks in ``pixi.toml``.
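Beyond reading ``pixi.toml``, pixi itself can enumerate the defined tasks; a minimal sketch, assuming a pixi version that provides the ``task list`` subcommand:

.. code-block:: console

    pixi task list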
diff --git a/docs/index.rst b/docs/index.rst
index 11e99434..f04a8eed 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,6 +10,7 @@ Welcome to metalearners's documentation!
Glossary
What about parallelism?
Examples
+ Development
FAQ
API Reference
Change Log
From 2adf3af60867d10f9feb7a03cfb43255f562cc64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Francesc=20Mart=C3=AD=20Escofet?=
<154450563+FrancescMartiEscofetQC@users.noreply.github.com>
Date: Wed, 26 Jun 2024 13:43:00 +0200
Subject: [PATCH 2/4] Remove verbosity (#29)
---
docs/examples/example_reuse.ipynb | 2 ++
1 file changed, 2 insertions(+)
diff --git a/docs/examples/example_reuse.ipynb b/docs/examples/example_reuse.ipynb
index dc4ab6c1..ad9542f7 100644
--- a/docs/examples/example_reuse.ipynb
+++ b/docs/examples/example_reuse.ipynb
@@ -253,6 +253,8 @@
"drlearner = DRLearner(\n",
" nuisance_model_factory=LGBMRegressor,\n",
" treatment_model_factory=LGBMRegressor,\n",
+ " nuisance_model_params={\"verbose\": -1},\n",
+ " treatment_model_params={\"verbose\": -1},\n",
" fitted_propensity_model=trained_propensity_model,\n",
" is_classification=False,\n",
" n_variants=2,\n",
From 34b9cde5573f572484390c973c1bfaf239328bd4 Mon Sep 17 00:00:00 2001
From: Kevin Klein <7267523+kklein@users.noreply.github.com>
Date: Wed, 26 Jun 2024 18:12:06 +0200
Subject: [PATCH 3/4] Enable Windows unit tests in CI (#32)
* Enable Windows tests.
* Use pandas' check for int.
---
.github/workflows/ci.yml | 2 +-
metalearners/_utils.py | 5 +++--
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e40cdc9d..02d971c0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -47,7 +47,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-latest, macos-latest]
+ os: [ubuntu-latest, macos-latest, windows-latest]
env: ["py310", "py311", "py312"]
steps:
- name: Checkout branch
diff --git a/metalearners/_utils.py b/metalearners/_utils.py
index 2767d5ae..a39c2c24 100644
--- a/metalearners/_utils.py
+++ b/metalearners/_utils.py
@@ -234,9 +234,10 @@ def convert_treatment(treatment: Vector) -> np.ndarray:
new_treatment = treatment.to_numpy()
if new_treatment.dtype == bool:
return new_treatment.astype(int)
- elif new_treatment.dtype == float and all(x.is_integer() for x in new_treatment):
+ if new_treatment.dtype == float and all(x.is_integer() for x in new_treatment):
return new_treatment.astype(int)
- elif new_treatment.dtype != int:
+
+ if not pd.api.types.is_integer_dtype(new_treatment):
raise TypeError(
"Treatment must be boolean, integer or float with integer values."
)
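The switch from ``new_treatment.dtype != int`` to ``pd.api.types.is_integer_dtype`` is what makes the check portable to Windows, where numpy's default integer type is 32-bit (the case for numpy < 2.0), so an ``int64`` array does not compare equal to the builtin ``int`` dtype. A minimal sketch of the difference, under that platform assumption:

.. code-block:: python

    import numpy as np
    import pandas as pd

    treatment = np.array([0, 1, 1], dtype=np.int64)

    # Platform-dependent: on Windows (numpy < 2.0), np.dtype(int) is int32,
    # so this comparison is False there.
    print(treatment.dtype == int)

    # Platform-independent: recognizes any integer dtype.
    print(pd.api.types.is_integer_dtype(treatment))  # True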
From 2f530b21c4fa26f6cf7afa4ae0b7ca1060f02ad3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Francesc=20Mart=C3=AD=20Escofet?=
<154450563+FrancescMartiEscofetQC@users.noreply.github.com>
Date: Wed, 26 Jun 2024 18:36:41 +0200
Subject: [PATCH 4/4] Parametrize evaluate (#8)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Speedup tests
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Switch `strict` meaning in `validate_number_positive`
* Add classes_ to cfe
* Fix RLoss calculation in evaluate
* Parametrize evaluate
* Run pre-commit hooks
* Update CHANGELOG
* Update metalearners/metalearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Update metalearners/metalearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Update metalearners/metalearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Update metalearners/metalearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Update metalearners/metalearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Fix naming
* Update metalearners/metalearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Fix docs
* Don't force subset
* Add test to ignore
* Centralize generation of default scoring (#22)
* Centralize generation of default scoring.
* Reuse more type hints.
* Update metalearners/metalearner.py
Co-authored-by: Francesc Martí Escofet <154450563+FrancescMartiEscofetQC@users.noreply.github.com>
* Update metalearners/metalearner.py
Co-authored-by: Francesc Martí Escofet <154450563+FrancescMartiEscofetQC@users.noreply.github.com>
* Apply pre-commit hooks.
---------
Co-authored-by: Francesc Martí Escofet <154450563+FrancescMartiEscofetQC@users.noreply.github.com>
* Update metalearners/metalearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Update metalearners/tlearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Update metalearners/xlearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Update metalearners/metalearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Rename
* Rename
* Rename
* Rename
* Rename
* Rename
* Update metalearners/drlearner.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Update metalearners/_utils.py
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
* Update CHANGELOG
* Add option to evaluate treatment model in RLearner
---------
Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
---
CHANGELOG.rst | 7 ++
metalearners/_typing.py | 7 +-
metalearners/_utils.py | 23 ++++
metalearners/cross_fit_estimator.py | 9 +-
metalearners/drlearner.py | 72 +++++++++--
metalearners/metalearner.py | 106 +++++++++++++++-
metalearners/rlearner.py | 103 +++++++++++++---
metalearners/slearner.py | 29 +++--
metalearners/tlearner.py | 34 +++---
metalearners/xlearner.py | 69 ++++++++++-
tests/test_learner.py | 179 ++++++++++++++++++++++++++--
11 files changed, 556 insertions(+), 82 deletions(-)
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c4ef4d47..4526f786 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,13 @@
Changelog
=========
+0.6.0 (2024-06-**)
+------------------
+
+* Added ``scoring`` parameter to :meth:`metalearners.metalearner.MetaLearner.evaluate` and
+ implemented the abstract method for the :class:`metalearners.XLearner` and
+ :class:`metalearners.DRLearner`.
+
0.5.0 (2024-06-18)
------------------
diff --git a/metalearners/_typing.py b/metalearners/_typing.py
index c23207e2..76b31ba5 100644
--- a/metalearners/_typing.py
+++ b/metalearners/_typing.py
@@ -1,7 +1,7 @@
# Copyright (c) QuantCo 2024-2024
# SPDX-License-Identifier: BSD-3-Clause
-from collections.abc import Collection, Mapping
+from collections.abc import Callable, Collection, Mapping, Sequence
from typing import Literal, Protocol, Union
import numpy as np
@@ -29,7 +29,6 @@ class _ScikitModel(Protocol):
# https://stackoverflow.com/questions/54868698/what-type-is-a-sklearn-model/60542986#60542986
def fit(self, X, y, *params, **kwargs): ...
-
def predict(self, X, *params, **kwargs): ...
def score(self, X, y, **kwargs): ...
@@ -44,3 +43,7 @@ def set_params(self, **params): ...
# For instance, if converting the Generator resulting from a call to
# sklearn.model_selection.KFold.split to a list we obtain this type.
SplitIndices = list[tuple[np.ndarray, np.ndarray]]
+
+Scorer = str | Callable
+Scorers = Sequence[Scorer]
+Scoring = Mapping[str, Scorers]
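With these aliases, a ``Scoring`` value is a mapping from model-kind name to a sequence of scorers, each being either an sklearn scorer name or a callable. A hypothetical instance (the model-kind keys depend on the concrete MetaLearner):

.. code-block:: python

    from sklearn.metrics import make_scorer, mean_absolute_error

    scoring: Scoring = {
        "propensity_model": ["neg_log_loss"],
        "outcome_model": ["neg_root_mean_squared_error", make_scorer(mean_absolute_error)],
    }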
diff --git a/metalearners/_utils.py b/metalearners/_utils.py
index a39c2c24..a4c7187b 100644
--- a/metalearners/_utils.py
+++ b/metalearners/_utils.py
@@ -464,3 +464,26 @@ def simplify_output_2d(tensor: np.ndarray) -> np.ndarray:
"This function requires a regression or a classification with binary outcome "
"task."
)
+
+
+# Taken from https://stackoverflow.com/questions/13741998/is-there-a-way-to-let-classes-inherit-the-documentation-of-their-superclass-with
+def copydoc(fromfunc, sep="\n"):
+ """
+ Decorator: Copy the docstring of ``fromfunc``
+ """
+
+ def _decorator(func):
+ sourcedoc = fromfunc.__doc__
+ if func.__doc__ is None:
+ func.__doc__ = sourcedoc
+ else:
+ func.__doc__ = sep.join([sourcedoc, func.__doc__])
+ return func
+
+ return _decorator
+
+
+def default_metric(predict_method: PredictMethod) -> str:
+ if predict_method == _PREDICT_PROBA:
+ return "neg_log_loss"
+ return "neg_root_mean_squared_error"
diff --git a/metalearners/cross_fit_estimator.py b/metalearners/cross_fit_estimator.py
index 3f8624c9..67604138 100644
--- a/metalearners/cross_fit_estimator.py
+++ b/metalearners/cross_fit_estimator.py
@@ -362,12 +362,17 @@ def __init__(
self.original_predict_proba = model.predict_proba
def __enter__(self):
- self.model.predict = partial( # type: ignore
+ new_predict = partial(
self.model.predict, is_oos=self.is_oos, oos_method=self.oos_method
)
- self.model.predict_proba = partial( # type: ignore
+ new_predict.__name__ = "predict" # type: ignore
+ self.model.predict = new_predict # type: ignore
+
+ new_predict_proba = partial(
self.model.predict_proba, is_oos=self.is_oos, oos_method=self.oos_method
)
+ new_predict_proba.__name__ = "predict_proba" # type: ignore
+ self.model.predict_proba = new_predict_proba # type: ignore
return self.model
def __exit__(self, *args):
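Wrapping the prediction methods in ``functools.partial`` strips their ``__name__``, which parts of the sklearn scoring machinery appear to rely on; assigning it explicitly, as above, restores it. A standalone sketch of the underlying behavior:

.. code-block:: python

    from functools import partial

    def predict(X, is_oos, oos_method=None):
        return X

    bound = partial(predict, is_oos=True)
    print(hasattr(bound, "__name__"))  # False: partials carry no __name__

    bound.__name__ = "predict"  # partial objects accept attribute assignment
    print(bound.__name__)  # "predict"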
diff --git a/metalearners/drlearner.py b/metalearners/drlearner.py
index 4ff15c20..944b33ae 100644
--- a/metalearners/drlearner.py
+++ b/metalearners/drlearner.py
@@ -1,11 +1,12 @@
# Copyright (c) QuantCo 2024-2024
# SPDX-License-Identifier: BSD-3-Clause
+
import numpy as np
from joblib import Parallel, delayed
from typing_extensions import Self
-from metalearners._typing import Matrix, OosMethod, Vector
+from metalearners._typing import Matrix, OosMethod, Scoring, Vector
from metalearners._utils import (
clip_element_absolute_value_to_epsilon,
get_one,
@@ -23,6 +24,7 @@
VARIANT_OUTCOME_MODEL,
MetaLearner,
_ConditionalAverageOutcomeMetaLearner,
+ _evaluate_model_kind,
_fit_cross_fit_estimator_joblib,
_ModelSpecifications,
_ParallelJoblibSpecification,
@@ -148,6 +150,7 @@ def fit(
w=w,
y=y,
treatment_variant=treatment_variant,
+ is_oos=False,
)
treatment_jobs.append(
@@ -205,37 +208,82 @@ def evaluate(
w: Vector,
is_oos: bool,
oos_method: OosMethod = OVERALL,
- ) -> dict[str, float | int]:
- raise NotImplementedError(
- "This feature is not yet implemented for the DR-Learner."
+ scoring: Scoring | None = None,
+ ) -> dict[str, float]:
+ safe_scoring = self._scoring(scoring)
+
+ variant_outcome_evaluation = _evaluate_model_kind(
+ cfes=self._nuisance_models[VARIANT_OUTCOME_MODEL],
+ Xs=[X[w == tv] for tv in range(self.n_variants)],
+ ys=[y[w == tv] for tv in range(self.n_variants)],
+ scorers=safe_scoring[VARIANT_OUTCOME_MODEL],
+ model_kind=VARIANT_OUTCOME_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=False,
+ )
+
+ propensity_evaluation = _evaluate_model_kind(
+ cfes=self._nuisance_models[PROPENSITY_MODEL],
+ Xs=[X],
+ ys=[w],
+ scorers=safe_scoring[PROPENSITY_MODEL],
+ model_kind=PROPENSITY_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=False,
+ )
+
+ pseudo_outcome: list[np.ndarray] = []
+ for treatment_variant in range(1, self.n_variants):
+ tv_pseudo_outcome = self._pseudo_outcome(
+ X=X,
+ y=y,
+ w=w,
+ treatment_variant=treatment_variant,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ )
+ pseudo_outcome.append(tv_pseudo_outcome)
+
+ treatment_evaluation = _evaluate_model_kind(
+ self._treatment_models[TREATMENT_MODEL],
+ Xs=[X for _ in range(1, self.n_variants)],
+ ys=pseudo_outcome,
+ scorers=safe_scoring[TREATMENT_MODEL],
+ model_kind=TREATMENT_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=True,
)
+ return variant_outcome_evaluation | propensity_evaluation | treatment_evaluation
+
def _pseudo_outcome(
self,
X: Matrix,
y: Vector,
w: Vector,
treatment_variant: int,
+ is_oos: bool,
+ oos_method: OosMethod = OVERALL,
epsilon: float = _EPSILON,
) -> np.ndarray:
- """Compute the DR-Learner pseudo outcome.
-
- Importantly, this method assumes to be applied on in-sample data.
- In other words, ``is_oos`` will always be set to ``False`` when calling
- ``predict_nuisance``.
- """
+ """Compute the DR-Learner pseudo outcome."""
validate_valid_treatment_variant_not_control(treatment_variant, self.n_variants)
conditional_average_outcome_estimates = (
self.predict_conditional_average_outcomes(
X=X,
- is_oos=False,
+ is_oos=is_oos,
+ oos_method=oos_method,
)
)
propensity_estimates = self.predict_nuisance(
X=X,
- is_oos=False,
+ is_oos=is_oos,
+ oos_method=oos_method,
model_kind=PROPENSITY_MODEL,
model_ord=0,
)
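For reference, the quantity computed by ``_pseudo_outcome`` is the standard doubly robust (AIPW) pseudo outcome; with conditional-outcome estimates :math:`\hat{\mu}_t`, propensity estimates :math:`\hat{e}_t`, treatment indicator :math:`W`, and outcome :math:`Y`, it takes the form below (a sketch of the conventional definition, not extracted from the implementation):

.. math::

    \hat{\varphi}_t(X) = \hat{\mu}_t(X) - \hat{\mu}_0(X)
        + \frac{\mathbb{1}\{W = t\}\,\bigl(Y - \hat{\mu}_t(X)\bigr)}{\hat{e}_t(X)}
        - \frac{\mathbb{1}\{W = 0\}\,\bigl(Y - \hat{\mu}_0(X)\bigr)}{\hat{e}_0(X)}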
diff --git a/metalearners/metalearner.py b/metalearners/metalearner.py
index c9539fde..4bce6359 100644
--- a/metalearners/metalearner.py
+++ b/metalearners/metalearner.py
@@ -2,7 +2,7 @@
# SPDX-License-Identifier: BSD-3-Clause
from abc import ABC, abstractmethod
-from collections.abc import Callable, Collection
+from collections.abc import Callable, Collection, Mapping, Sequence
from copy import deepcopy
from dataclasses import dataclass
from typing import TypedDict
@@ -10,6 +10,7 @@
import numpy as np
import pandas as pd
import shap
+from sklearn.metrics import get_scorer
from sklearn.model_selection import KFold
from typing_extensions import Self
@@ -20,11 +21,13 @@
OosMethod,
Params,
PredictMethod,
+ Scoring,
SplitIndices,
Vector,
_ScikitModel,
)
from metalearners._utils import (
+ default_metric,
index_matrix,
validate_model_and_predict_method,
validate_number_positive,
@@ -32,6 +35,7 @@
from metalearners.cross_fit_estimator import (
OVERALL,
CrossFitEstimator,
+ _PredictContext,
)
from metalearners.explainer import Explainer
@@ -133,6 +137,49 @@ def _validate_n_folds_synchronize(n_folds: dict[str, int]) -> None:
raise ValueError("Need at least two folds to use synchronization.")
+def _evaluate_model_kind(
+ cfes: Sequence[CrossFitEstimator],
+ Xs: Sequence[Matrix],
+ ys: Sequence[Vector],
+ scorers: Sequence[str | Callable],
+ model_kind: str,
+ is_oos: bool,
+ is_treatment_model: bool,
+ oos_method: OosMethod = OVERALL,
+ sample_weights: Sequence[Vector] | None = None,
+) -> dict[str, float]:
+ """Helper function to evaluate all the models of the same model kind."""
+ prefix = f"{model_kind}_"
+ evaluation_metrics: dict[str, float] = {}
+ for idx, scorer in enumerate(scorers):
+ if isinstance(scorer, str):
+ scorer_name = scorer
+ scorer_callable: Callable = get_scorer(scorer)
+ else:
+ scorer_name = f"custom_scorer_{idx}"
+ scorer_callable = scorer
+ for i, cfe in enumerate(cfes):
+ if is_treatment_model:
+ treatment_variant = i + 1
+ index_str = f"{treatment_variant}_vs_0_"
+ else:
+ if len(cfes) == 1:
+ index_str = ""
+ else:
+ index_str = f"{i}_"
+ name = f"{prefix}{index_str}{scorer_name}"
+ with _PredictContext(cfe, is_oos, oos_method) as modified_cfe:
+ if sample_weights:
+ evaluation_metrics[name] = scorer_callable(
+ modified_cfe, Xs[i], ys[i], sample_weight=sample_weights[i]
+ )
+ else:
+ evaluation_metrics[name] = scorer_callable(
+ modified_cfe, Xs[i], ys[i]
+ )
+ return evaluation_metrics
+
+
class _ModelSpecifications(TypedDict):
# The quotes on MetaLearner are necessary for type hinting as it's not yet defined
# here. Check https://stackoverflow.com/questions/55320236/does-python-evaluate-type-hinting-of-a-forward-reference
@@ -809,8 +856,40 @@ def evaluate(
w: Vector,
is_oos: bool,
oos_method: OosMethod = OVERALL,
- ) -> dict[str, float | int]:
- """Evaluate all models contained in a MetaLearner."""
+ scoring: Mapping[str, list[str | Callable]] | None = None,
+ ) -> dict[str, float]:
+ r"""Evaluate the MetaLearner.
+
+ The keys in ``scoring`` which do not correspond to the name of a model contained in the
+ MetaLearner are ignored. For information about these names, see
+ :meth:`~metalearners.metalearner.MetaLearner.nuisance_model_specifications` and
+ :meth:`~metalearners.metalearner.MetaLearner.treatment_model_specifications`.
+ The values must be a list of:
+
+ * ``string`` representing a ``sklearn`` scoring method. Check
+ `here <https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter>`__
+ for the possible values.
+ * ``Callable`` with signature ``scorer(estimator, X, y_true, **kwargs)``. We recommend
+ using `sklearn.metrics.make_scorer <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html>`_
+ to create such a ``Callable``.
+
+ If a model name is not present in the keys of ``scoring``, the default metric is used:
+ ``neg_log_loss`` if the model is a classifier and ``neg_root_mean_squared_error`` if it
+ is a regressor.
+
+ The returned dictionary keys have the following structure:
+
+ * For nuisance models:
+
+ * If the cardinality is one: ``f"{model_kind}_{scorer}"``
+ * If there is one model for each treatment variant (including control):
+ ``f"{model_kind}_{treatment_variant}_{scorer}"``
+
+ * For treatment models: ``f"{model_kind}_{treatment_variant}_vs_0_{scorer}"``
+
+ Here, ``scorer`` is the name of the scorer if it is a string, and ``"custom_scorer_{idx}"``
+ if it is a callable, where ``idx`` is the index of the scorer in the corresponding list.
+ """
...
def explainer(
@@ -940,6 +1019,27 @@ def shap_values(
shap_explainer_params=shap_explainer_params,
)
+ def _scoring(self, scoring: Scoring | None) -> Scoring:
+
+ def _default_scoring() -> Scoring:
+ return {
+ nuisance_model: [
+ default_metric(model_specifications["predict_method"](self))
+ ]
+ for nuisance_model, model_specifications in self.nuisance_model_specifications().items()
+ } | {
+ treatment_model: [
+ default_metric(model_specifications["predict_method"](self))
+ ]
+ for treatment_model, model_specifications in self.treatment_model_specifications().items()
+ }
+
+ default_scoring = _default_scoring()
+
+ if scoring is None:
+ return default_scoring
+ return dict(default_scoring) | dict(scoring)
+
class _ConditionalAverageOutcomeMetaLearner(MetaLearner, ABC):
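Combining the naming scheme from the docstring with the default filling performed by ``_scoring``, a hypothetical call against an already fitted two-variant learner (``learner``, ``X``, ``y``, and ``w`` are assumed to exist) might look like:

.. code-block:: python

    scoring = {
        "propensity_model": ["neg_log_loss", "accuracy"],
        "treatment_model": ["neg_mean_absolute_error"],
    }
    evaluation = learner.evaluate(X=X, y=y, w=w, is_oos=False, scoring=scoring)

    # Resulting keys follow f"{model_kind}_{index_str}{scorer}", e.g.:
    #   propensity_model_neg_log_loss
    #   propensity_model_accuracy
    #   treatment_model_1_vs_0_neg_mean_absolute_error
    # plus default metrics for any model kind absent from `scoring`.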
diff --git a/metalearners/rlearner.py b/metalearners/rlearner.py
index c66dcc9f..ee6e45df 100644
--- a/metalearners/rlearner.py
+++ b/metalearners/rlearner.py
@@ -1,14 +1,16 @@
# Copyright (c) QuantCo 2024-2024
# SPDX-License-Identifier: BSD-3-Clause
+
import numpy as np
from joblib import Parallel, delayed
-from sklearn.metrics import log_loss, root_mean_squared_error
+from sklearn.metrics import root_mean_squared_error
from typing_extensions import Self
-from metalearners._typing import Matrix, OosMethod, Vector
+from metalearners._typing import Matrix, OosMethod, Scoring, Vector
from metalearners._utils import (
clip_element_absolute_value_to_epsilon,
+ copydoc,
function_has_argument,
get_one,
get_predict,
@@ -24,6 +26,7 @@
TREATMENT,
TREATMENT_MODEL,
MetaLearner,
+ _evaluate_model_kind,
_fit_cross_fit_estimator_joblib,
_ModelSpecifications,
_ParallelJoblibSpecification,
@@ -227,6 +230,7 @@ def fit(
treatment_variant=treatment_variant,
mask=mask,
epsilon=epsilon,
+ is_oos=False,
)
X_filtered = index_matrix(X, mask)
@@ -323,6 +327,7 @@ def predict(
tau_hat[variant_indices, treatment_variant - 1] = variant_estimates
return tau_hat
+ @copydoc(MetaLearner.evaluate, sep="\n\t")
def evaluate(
self,
X: Matrix,
@@ -330,7 +335,37 @@ def evaluate(
w: Vector,
is_oos: bool,
oos_method: OosMethod = OVERALL,
- ) -> dict[str, float | int]:
+ scoring: Scoring | None = None,
+ ) -> dict[str, float]:
+ """In the RLearner case, the ``"treatment_model"`` is always evaluated with the
+ :func:`~metalearners.rlearner.r_loss` besides the scorers in
+ ``scoring["treatment_model"]``, which should support passing the
+ ``sample_weight`` keyword argument."""
+ safe_scoring = self._scoring(scoring)
+
+ propensity_evaluation = _evaluate_model_kind(
+ cfes=self._nuisance_models[PROPENSITY_MODEL],
+ Xs=[X],
+ ys=[w],
+ scorers=safe_scoring[PROPENSITY_MODEL],
+ model_kind=PROPENSITY_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=False,
+ )
+
+ outcome_evaluation = _evaluate_model_kind(
+ cfes=self._nuisance_models[OUTCOME_MODEL],
+ Xs=[X],
+ ys=[y],
+ scorers=safe_scoring[OUTCOME_MODEL],
+ model_kind=OUTCOME_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=False,
+ )
+
+ # TODO: improve this? generalize it to other metalearners?
w_hat = self.predict_nuisance(
X=X,
is_oos=is_oos,
@@ -338,7 +373,6 @@ def evaluate(
model_kind=PROPENSITY_MODEL,
model_ord=0,
)
- propensity_evaluation = {"propensity_cross_entropy": log_loss(w, w_hat)}
y_hat = self.predict_nuisance(
X=X,
@@ -350,13 +384,39 @@ def evaluate(
if self.is_classification:
y_hat = y_hat[:, 1]
- outcome_evaluation = (
- {"outcome_log_loss": log_loss(y, y_hat)}
- if self.is_classification
- else {"outcome_rmse": root_mean_squared_error(y, y_hat)}
+ pseudo_outcome: list[np.ndarray] = []
+ sample_weights: list[np.ndarray] = []
+ masks: list[Vector] = []
+ is_control = w == 0
+ for treatment_variant in range(1, self.n_variants):
+ is_treatment = w == treatment_variant
+ mask = is_treatment | is_control
+ tv_pseudo_outcome, tv_sample_weights = self._pseudo_outcome_and_weights(
+ X=X,
+ y=y,
+ w=w,
+ treatment_variant=treatment_variant,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ mask=mask,
+ )
+ pseudo_outcome.append(tv_pseudo_outcome)
+ sample_weights.append(tv_sample_weights)
+ masks.append(mask)
+
+ treatment_evaluation = _evaluate_model_kind(
+ self._treatment_models[TREATMENT_MODEL],
+ Xs=[X[masks[tv - 1]] for tv in range(1, self.n_variants)],
+ ys=pseudo_outcome,
+ scorers=safe_scoring[TREATMENT_MODEL],
+ model_kind=TREATMENT_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=True,
+ sample_weights=sample_weights,
)
- treatment_evaluation = {}
+ rloss_evaluation = {}
tau_hat = self.predict(X=X, is_oos=is_oos, oos_method=oos_method)
is_control = w == 0
for treatment_variant in range(1, self.n_variants):
@@ -371,15 +431,19 @@ def evaluate(
if self.is_classification
else tau_hat[:, treatment_variant - 1, 0]
)
- treatment_evaluation[f"r_loss_{treatment_variant}_vs_0"] = r_loss(
+ rloss_evaluation[f"r_loss_{treatment_variant}_vs_0"] = r_loss(
cate_estimates=cate_estimates[mask],
outcome_estimates=y_hat[mask],
propensity_scores=propensity_estimates[mask],
outcomes=y[mask],
treatments=w[mask] == treatment_variant,
)
-
- return propensity_evaluation | outcome_evaluation | treatment_evaluation
+ return (
+ propensity_evaluation
+ | outcome_evaluation
+ | rloss_evaluation
+ | treatment_evaluation
+ )
def _pseudo_outcome_and_weights(
self,
@@ -387,15 +451,13 @@ def _pseudo_outcome_and_weights(
y: Vector,
w: Vector,
treatment_variant: int,
+ is_oos: bool,
+ oos_method: OosMethod = OVERALL,
mask: Vector | None = None,
epsilon: float = _EPSILON,
) -> tuple[np.ndarray, np.ndarray]:
"""Compute the R-Learner pseudo outcome and corresponding weights.
- Importantly, this method assumes to be applied on in-sample data.
- In other words, ``is_oos`` will always be set to ``False`` when calling
- ``predict_nuisance``.
-
If ``mask`` is provided, the returned pseudo outcomes and weights only refer to
the observations that the mask selects.
@@ -411,12 +473,17 @@ def _pseudo_outcome_and_weights(
# be able to match original observations with their corresponding folds.
y_estimates = self.predict_nuisance(
X=X,
- is_oos=False,
+ is_oos=is_oos,
model_kind=OUTCOME_MODEL,
model_ord=0,
+ oos_method=oos_method,
)[mask]
w_estimates = self.predict_nuisance(
- X=X, is_oos=False, model_kind=PROPENSITY_MODEL, model_ord=0
+ X=X,
+ is_oos=is_oos,
+ model_kind=PROPENSITY_MODEL,
+ model_ord=0,
+ oos_method=oos_method,
)[mask]
w_estimates_binarized = w_estimates[:, treatment_variant] / (
w_estimates[:, 0] + w_estimates[:, treatment_variant]
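Because the R-Learner hands per-observation weights to its treatment-model scorers, custom entries under ``scoring["treatment_model"]`` need to accept ``sample_weight``; scorers built with ``sklearn.metrics.make_scorer`` do so whenever the underlying metric does. A sketch, assuming a fitted ``rlearner`` and matching ``X``, ``y``, ``w``:

.. code-block:: python

    from sklearn.metrics import make_scorer, mean_squared_error

    # mean_squared_error accepts sample_weight, so the resulting scorer does too.
    weighted_mse = make_scorer(mean_squared_error, greater_is_better=False)

    evaluation = rlearner.evaluate(
        X=X, y=y, w=w, is_oos=False, scoring={"treatment_model": [weighted_mse]}
    )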
diff --git a/metalearners/slearner.py b/metalearners/slearner.py
index 9d42522b..9d49e20c 100644
--- a/metalearners/slearner.py
+++ b/metalearners/slearner.py
@@ -5,7 +5,6 @@
import numpy as np
import pandas as pd
-from sklearn.metrics import log_loss, root_mean_squared_error
from typing_extensions import Self
from metalearners._typing import (
@@ -14,6 +13,7 @@
ModelFactory,
OosMethod,
Params,
+ Scoring,
Vector,
_ScikitModel,
)
@@ -23,7 +23,12 @@
supports_categoricals,
)
from metalearners.cross_fit_estimator import OVERALL, CrossFitEstimator
-from metalearners.metalearner import NUISANCE, MetaLearner, _ModelSpecifications
+from metalearners.metalearner import (
+ NUISANCE,
+ MetaLearner,
+ _evaluate_model_kind,
+ _ModelSpecifications,
+)
_BASE_MODEL = "base_model"
@@ -191,17 +196,23 @@ def evaluate(
w: Vector,
is_oos: bool,
oos_method: OosMethod = OVERALL,
- ) -> dict[str, float | int]:
- # TODO: Parameterize evaluation approaches.
+ scoring: Scoring | None = None,
+ ) -> dict[str, float]:
+ safe_scoring = self._scoring(scoring)
+
X_with_w = _append_treatment_to_covariates(
X, w, self._supports_categoricals, self.n_variants
)
- y_pred = self.predict_nuisance(
- X=X_with_w, model_kind=_BASE_MODEL, model_ord=0, is_oos=is_oos
+ return _evaluate_model_kind(
+ cfes=self._nuisance_models[_BASE_MODEL],
+ Xs=[X_with_w],
+ ys=[y],
+ scorers=safe_scoring[_BASE_MODEL],
+ model_kind=_BASE_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=False,
)
- if self.is_classification:
- return {"cross_entropy": log_loss(y, y_pred)}
- return {"rmse": root_mean_squared_error(y, y_pred)}
def predict_conditional_average_outcomes(
self, X: Matrix, is_oos: bool, oos_method: OosMethod = OVERALL
diff --git a/metalearners/tlearner.py b/metalearners/tlearner.py
index f4673c67..81f4ae40 100644
--- a/metalearners/tlearner.py
+++ b/metalearners/tlearner.py
@@ -3,10 +3,9 @@
import numpy as np
from joblib import Parallel, delayed
-from sklearn.metrics import log_loss, root_mean_squared_error
from typing_extensions import Self
-from metalearners._typing import Matrix, OosMethod, Vector
+from metalearners._typing import Matrix, OosMethod, Scoring, Vector
from metalearners._utils import index_matrix
from metalearners.cross_fit_estimator import OVERALL
from metalearners.metalearner import (
@@ -14,6 +13,7 @@
VARIANT_OUTCOME_MODEL,
MetaLearner,
_ConditionalAverageOutcomeMetaLearner,
+ _evaluate_model_kind,
_fit_cross_fit_estimator_joblib,
_ModelSpecifications,
_ParallelJoblibSpecification,
@@ -113,21 +113,17 @@ def evaluate(
w: Vector,
is_oos: bool,
oos_method: OosMethod = OVERALL,
- ) -> dict[str, float | int]:
- # TODO: Parametrize evaluation approaches.
- conditional_average_outcomes = self.predict_conditional_average_outcomes(
- X=X, is_oos=is_oos, oos_method=oos_method
+ scoring: Scoring | None = None,
+ ) -> dict[str, float]:
+ safe_scoring = self._scoring(scoring)
+
+ return _evaluate_model_kind(
+ cfes=self._nuisance_models[VARIANT_OUTCOME_MODEL],
+ Xs=[X[w == tv] for tv in range(self.n_variants)],
+ ys=[y[w == tv] for tv in range(self.n_variants)],
+ scorers=safe_scoring[VARIANT_OUTCOME_MODEL],
+ model_kind=VARIANT_OUTCOME_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=False,
)
- evaluation_metrics = {}
- for treatment_variant in range(self.n_variants):
- prefix = f"variant_{treatment_variant}"
- variant_outcomes = conditional_average_outcomes[:, treatment_variant]
- if self.is_classification:
- evaluation_metrics[f"{prefix}_cross_entropy"] = log_loss(
- y[w == treatment_variant], variant_outcomes[w == treatment_variant]
- )
- else:
- evaluation_metrics[f"{prefix}_rmse"] = root_mean_squared_error(
- y[w == treatment_variant], variant_outcomes[w == treatment_variant]
- )
- return evaluation_metrics
diff --git a/metalearners/xlearner.py b/metalearners/xlearner.py
index ab62926c..1ef25393 100644
--- a/metalearners/xlearner.py
+++ b/metalearners/xlearner.py
@@ -1,11 +1,12 @@
# Copyright (c) QuantCo 2024-2024
# SPDX-License-Identifier: BSD-3-Clause
+
import numpy as np
from joblib import Parallel, delayed
from typing_extensions import Self
-from metalearners._typing import Matrix, OosMethod, Vector
+from metalearners._typing import Matrix, OosMethod, Scoring, Vector
from metalearners._utils import (
get_one,
get_predict,
@@ -21,6 +22,7 @@
VARIANT_OUTCOME_MODEL,
MetaLearner,
_ConditionalAverageOutcomeMetaLearner,
+ _evaluate_model_kind,
_fit_cross_fit_estimator_joblib,
_ModelSpecifications,
_ParallelJoblibSpecification,
@@ -285,9 +287,68 @@ def evaluate(
w: Vector,
is_oos: bool,
oos_method: OosMethod = OVERALL,
- ) -> dict[str, float | int]:
- raise NotImplementedError(
- "This feature is not yet implemented for the X-Learner."
+ scoring: Scoring | None = None,
+ ) -> dict[str, float]:
+ safe_scoring = self._scoring(scoring)
+
+ variant_outcome_evaluation = _evaluate_model_kind(
+ cfes=self._nuisance_models[VARIANT_OUTCOME_MODEL],
+ Xs=[X[w == tv] for tv in range(self.n_variants)],
+ ys=[y[w == tv] for tv in range(self.n_variants)],
+ scorers=safe_scoring[VARIANT_OUTCOME_MODEL],
+ model_kind=VARIANT_OUTCOME_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=False,
+ )
+
+ propensity_evaluation = _evaluate_model_kind(
+ cfes=self._nuisance_models[PROPENSITY_MODEL],
+ Xs=[X],
+ ys=[w],
+ scorers=safe_scoring[PROPENSITY_MODEL],
+ model_kind=PROPENSITY_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=False,
+ )
+
+ imputed_te_control: list[np.ndarray] = []
+ imputed_te_treatment: list[np.ndarray] = []
+ for treatment_variant in range(1, self.n_variants):
+ tv_imputed_te_control, tv_imputed_te_treatment = self._pseudo_outcome(
+ X, y, w, treatment_variant
+ )
+ imputed_te_control.append(tv_imputed_te_control)
+ imputed_te_treatment.append(tv_imputed_te_treatment)
+
+ te_treatment_evaluation = _evaluate_model_kind(
+ self._treatment_models[TREATMENT_EFFECT_MODEL],
+ Xs=[X[w == tv] for tv in range(1, self.n_variants)],
+ ys=imputed_te_treatment,
+ scorers=safe_scoring[TREATMENT_EFFECT_MODEL],
+ model_kind=TREATMENT_EFFECT_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=True,
+ )
+
+ te_control_evaluation = _evaluate_model_kind(
+ self._treatment_models[CONTROL_EFFECT_MODEL],
+ Xs=[X[w == 0] for _ in range(1, self.n_variants)],
+ ys=imputed_te_control,
+ scorers=safe_scoring[CONTROL_EFFECT_MODEL],
+ model_kind=CONTROL_EFFECT_MODEL,
+ is_oos=is_oos,
+ oos_method=oos_method,
+ is_treatment_model=True,
+ )
+
+ return (
+ variant_outcome_evaluation
+ | propensity_evaluation
+ | te_treatment_evaluation
+ | te_control_evaluation
)
def _pseudo_outcome(
diff --git a/tests/test_learner.py b/tests/test_learner.py
index efe36c76..f001eda2 100644
--- a/tests/test_learner.py
+++ b/tests/test_learner.py
@@ -5,7 +5,7 @@
import pytest
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
-from sklearn.metrics import root_mean_squared_error
+from sklearn.metrics import make_scorer, root_mean_squared_error
from sklearn.model_selection import train_test_split
from metalearners.cross_fit_estimator import _OOS_WHITELIST
@@ -309,11 +309,14 @@ def test_learner_twins(metalearner, reference_value, twins_data, rng):
assert rmse < reference_value * (1 + _OOS_REFERENCE_VALUE_TOLERANCE)
-@pytest.mark.parametrize("metalearner", ["S", "T", "R"])
+@pytest.mark.parametrize("metalearner", ["S", "T", "X", "R", "DR"])
@pytest.mark.parametrize("n_classes", [2, 5, 10])
@pytest.mark.parametrize("n_variants", [2, 5])
@pytest.mark.parametrize("is_classification", [True, False])
-def test_learner_evaluate(metalearner, is_classification, rng, n_classes, n_variants):
+@pytest.mark.parametrize("is_oos", [True, False])
+def test_learner_evaluate(
+ metalearner, is_classification, rng, n_classes, n_variants, is_oos
+):
sample_size = 1000
factory = metalearner_factory(metalearner)
if n_variants > 2 and not factory._supports_multi_treatment():
@@ -322,12 +325,17 @@ def test_learner_evaluate(metalearner, is_classification, rng, n_classes, n_vari
pytest.skip() # skip repeated tests
if is_classification and n_classes > 2 and not factory._supports_multi_class():
pytest.skip()
+ test_size = 250
X = rng.standard_normal((sample_size, 10))
+ X_test = rng.standard_normal((test_size, 10)) if is_oos else X
w = rng.integers(0, n_variants, size=sample_size)
+ w_test = rng.integers(0, n_variants, test_size) if is_oos else w
if is_classification:
y = rng.integers(0, n_classes, size=sample_size)
+ y_test = rng.integers(0, n_classes, test_size) if is_oos else y
else:
y = rng.standard_normal(sample_size)
+ y_test = rng.standard_normal(test_size) if is_oos else y
base_learner = _linear_base_learner(is_classification)
@@ -340,28 +348,173 @@ def test_learner_evaluate(metalearner, is_classification, rng, n_classes, n_vari
n_folds=2,
)
learner.fit(X=X, y=y, w=w)
- evaluation = learner.evaluate(X=X, y=y, w=w, is_oos=False)
+ evaluation = learner.evaluate(X=X_test, y=y_test, w=w_test, is_oos=is_oos)
if is_classification:
if metalearner == "S":
- assert "cross_entropy" in evaluation
- elif metalearner == "T":
+ assert set(evaluation.keys()) == {"base_model_neg_log_loss"}
+ elif metalearner in ["T", "X", "DR"]:
for v in range(n_variants):
- assert f"variant_{v}_cross_entropy" in evaluation
+ assert f"variant_outcome_model_{v}_neg_log_loss" in evaluation
elif metalearner == "R":
- assert "outcome_log_loss" in evaluation
+ assert "outcome_model_neg_log_loss" in evaluation
else:
if metalearner == "S":
- assert "rmse" in evaluation
- elif metalearner == "T":
+ assert set(evaluation.keys()) == {"base_model_neg_root_mean_squared_error"}
+ elif metalearner in ["T", "X", "DR"]:
for v in range(n_variants):
- assert f"variant_{v}_rmse" in evaluation
+ assert (
+ f"variant_outcome_model_{v}_neg_root_mean_squared_error"
+ in evaluation
+ )
elif metalearner == "R":
- assert "outcome_rmse" in evaluation
+ assert "outcome_model_neg_root_mean_squared_error" in evaluation
if metalearner == "R":
assert (
{f"r_loss_{i}_vs_0" for i in range(1, n_variants)}
- | {"propensity_cross_entropy"}
+ | {"propensity_model_neg_log_loss"}
+ | {
+ f"treatment_model_{i}_vs_0_neg_root_mean_squared_error"
+ for i in range(1, n_variants)
+ }
) <= set(evaluation.keys())
+ elif metalearner == "X":
+ assert "propensity_model_neg_log_loss" in evaluation
+ for v in range(1, n_variants):
+ assert (
+ f"treatment_effect_model_{v}_vs_0_neg_root_mean_squared_error"
+ in evaluation
+ )
+ assert (
+ f"control_effect_model_{v}_vs_0_neg_root_mean_squared_error"
+ in evaluation
+ )
+ elif metalearner == "DR":
+ assert "propensity_model_neg_log_loss" in evaluation
+ for v in range(1, n_variants):
+ assert f"treatment_model_{v}_vs_0_neg_root_mean_squared_error" in evaluation
+
+
+def new_score(estimator, X, y):
+ # This score doesn't make sense.
+ return np.mean(y - estimator.predict(X))
+
+
+def new_score_2(y, y_pred):
+ # This score doesn't make sense.
+ return np.mean(y - y_pred)
+
+
+@pytest.mark.parametrize(
+ "metalearner, is_classification, scoring, expected_keys",
+ [
+ ("S", True, {"base_model": ["accuracy"]}, {"base_model_accuracy"}),
+ ("S", False, {"base_model": ["max_error"]}, {"base_model_max_error"}),
+ (
+ "T",
+ False,
+ {
+ "variant_outcome_model": [new_score, make_scorer(new_score_2)],
+ "to_ignore": [],
+ },
+ {
+ "variant_outcome_model_0_custom_scorer_0",
+ "variant_outcome_model_0_custom_scorer_1",
+ "variant_outcome_model_1_custom_scorer_0",
+ "variant_outcome_model_1_custom_scorer_1",
+ "variant_outcome_model_2_custom_scorer_0",
+ "variant_outcome_model_2_custom_scorer_1",
+ },
+ ),
+ (
+ "X",
+ True,
+ {
+ "variant_outcome_model": ["f1"],
+ "propensity_model": [],
+ "control_effect_model": [],
+ "treatment_effect_model": ["r2", new_score],
+ },
+ {
+ "variant_outcome_model_0_f1",
+ "variant_outcome_model_1_f1",
+ "variant_outcome_model_2_f1",
+ "treatment_effect_model_1_vs_0_r2",
+ "treatment_effect_model_1_vs_0_custom_scorer_1",
+ "treatment_effect_model_2_vs_0_r2",
+ "treatment_effect_model_2_vs_0_custom_scorer_1",
+ },
+ ),
+ (
+ "R",
+ False,
+ {
+ "outcome_model": [make_scorer(new_score_2)],
+ "propensity_model": [],
+ "treatment_model": ["neg_mean_absolute_error"],
+ },
+ {
+ "outcome_model_custom_scorer_0",
+ "r_loss_1_vs_0",
+ "r_loss_2_vs_0",
+ "treatment_model_1_vs_0_neg_mean_absolute_error",
+ "treatment_model_2_vs_0_neg_mean_absolute_error",
+ },
+ ),
+ (
+ "DR",
+ True,
+ {
+ "variant_outcome_model": ["f1"],
+ "propensity_model": [],
+ "treatment_model": ["r2", new_score],
+ },
+ {
+ "variant_outcome_model_0_f1",
+ "variant_outcome_model_1_f1",
+ "variant_outcome_model_2_f1",
+ "treatment_model_1_vs_0_r2",
+ "treatment_model_1_vs_0_custom_scorer_1",
+ "treatment_model_2_vs_0_r2",
+ "treatment_model_2_vs_0_custom_scorer_1",
+ },
+ ),
+ ],
+)
+@pytest.mark.parametrize("is_oos", [True, False])
+def test_learner_evaluate_scoring(
+ metalearner, is_classification, scoring, expected_keys, is_oos, rng
+):
+ factory = metalearner_factory(metalearner)
+ nuisance_model_factory = _linear_base_learner(is_classification)
+ nuisance_model_params = _linear_base_learner_params(is_classification)
+
+ n_variants = 3
+ sample_size = 1000
+ test_size = 250
+ X = rng.standard_normal((sample_size, 10))
+ X_test = rng.standard_normal((test_size, 10)) if is_oos else X
+ w = rng.integers(0, n_variants, size=sample_size)
+ w_test = rng.integers(0, n_variants, test_size) if is_oos else w
+ if is_classification:
+ y = rng.integers(0, 2, size=sample_size)
+ y_test = rng.integers(0, 2, test_size) if is_oos else y
+ else:
+ y = rng.standard_normal(sample_size)
+ y_test = rng.standard_normal(test_size) if is_oos else y
+
+ ml = factory(
+ is_classification=is_classification,
+ n_variants=n_variants,
+ nuisance_model_factory=nuisance_model_factory,
+ propensity_model_factory=LGBMClassifier,
+ treatment_model_factory=LinearRegression,
+ nuisance_model_params=nuisance_model_params,
+ propensity_model_params={"n_estimators": 1},
+ n_folds=2,
+ )
+ ml.fit(X, y, w)
+ evaluation = ml.evaluate(X_test, y_test, w_test, is_oos, scoring=scoring)
+ assert set(evaluation.keys()) == expected_keys
@pytest.mark.parametrize("outcome_kind", ["binary", "continuous"])