diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c4ef4d4..4526f78 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,13 @@
 Changelog
 =========
 
+0.6.0 (2024-06-**)
+------------------
+
+* Added ``scoring`` parameter to :meth:`metalearners.metalearner.MetaLearner.evaluate` and
+  implemented the abstract method for the :class:`metalearners.XLearner` and
+  :class:`metalearners.DRLearner`.
+
 0.5.0 (2024-06-18)
 ------------------
 
diff --git a/metalearners/_typing.py b/metalearners/_typing.py
index c23207e..76b31ba 100644
--- a/metalearners/_typing.py
+++ b/metalearners/_typing.py
@@ -1,7 +1,7 @@
 # Copyright (c) QuantCo 2024-2024
 # SPDX-License-Identifier: BSD-3-Clause
 
-from collections.abc import Collection, Mapping
+from collections.abc import Callable, Collection, Mapping, Sequence
 from typing import Literal, Protocol, Union
 
 import numpy as np
@@ -29,7 +29,6 @@ class _ScikitModel(Protocol):
     # https://stackoverflow.com/questions/54868698/what-type-is-a-sklearn-model/60542986#60542986
 
     def fit(self, X, y, *params, **kwargs): ...
-
     def predict(self, X, *params, **kwargs): ...
     def score(self, X, y, **kwargs): ...
 
@@ -44,3 +43,7 @@ def set_params(self, **params): ...
 # For instance, if converting the Generator resulting from a call to
 # sklearn.model_selection.KFold.split to a list we obtain this type.
 SplitIndices = list[tuple[np.ndarray, np.ndarray]]
+
+Scorer = str | Callable
+Scorers = Sequence[Scorer]
+Scoring = Mapping[str, Scorers]
diff --git a/metalearners/_utils.py b/metalearners/_utils.py
index 2767d5a..091d0ff 100644
--- a/metalearners/_utils.py
+++ b/metalearners/_utils.py
@@ -463,3 +463,26 @@ def simplify_output_2d(tensor: np.ndarray) -> np.ndarray:
         "This function requires a regression or a classification with binary outcome "
         "task."
     )
+
+
+# Taken from https://stackoverflow.com/questions/13741998/is-there-a-way-to-let-classes-inherit-the-documentation-of-their-superclass-with
+def copydoc(fromfunc, sep="\n"):
+    """
+    Decorator: Copy the docstring of ``fromfunc``
+    """
+
+    def _decorator(func):
+        sourcedoc = fromfunc.__doc__
+        if func.__doc__ is None:
+            func.__doc__ = sourcedoc
+        else:
+            func.__doc__ = sep.join([sourcedoc, func.__doc__])
+        return func
+
+    return _decorator
+
+
+def default_metric(predict_method: PredictMethod) -> str:
+    if predict_method == _PREDICT_PROBA:
+        return "neg_log_loss"
+    return "neg_root_mean_squared_error"
diff --git a/metalearners/cross_fit_estimator.py b/metalearners/cross_fit_estimator.py
index 3f8624c..6760413 100644
--- a/metalearners/cross_fit_estimator.py
+++ b/metalearners/cross_fit_estimator.py
@@ -362,12 +362,17 @@ def __init__(
         self.original_predict_proba = model.predict_proba
 
     def __enter__(self):
-        self.model.predict = partial(  # type: ignore
+        new_predict = partial(
             self.model.predict, is_oos=self.is_oos, oos_method=self.oos_method
         )
-        self.model.predict_proba = partial(  # type: ignore
+        new_predict.__name__ = "predict"  # type: ignore
+        self.model.predict = new_predict  # type: ignore
+
+        new_predict_proba = partial(
             self.model.predict_proba, is_oos=self.is_oos, oos_method=self.oos_method
         )
+        new_predict_proba.__name__ = "predict_proba"  # type: ignore
+        self.model.predict_proba = new_predict_proba  # type: ignore
         return self.model
 
     def __exit__(self, *args):
diff --git a/metalearners/drlearner.py b/metalearners/drlearner.py
index 4ff15c2..944b33a 100644
--- a/metalearners/drlearner.py
+++ b/metalearners/drlearner.py
@@ -1,11 +1,12 @@
 # Copyright (c) QuantCo 2024-2024
 # SPDX-License-Identifier: BSD-3-Clause
 
+
 import numpy as np
 from joblib import Parallel, delayed
 from typing_extensions import Self
 
-from metalearners._typing import Matrix, OosMethod, Vector
+from metalearners._typing import Matrix, OosMethod, Scoring, Vector
 from metalearners._utils import (
     clip_element_absolute_value_to_epsilon,
     get_one,
@@ -23,6 +24,7 @@
     VARIANT_OUTCOME_MODEL,
     MetaLearner,
     _ConditionalAverageOutcomeMetaLearner,
+    _evaluate_model_kind,
     _fit_cross_fit_estimator_joblib,
     _ModelSpecifications,
     _ParallelJoblibSpecification,
@@ -148,6 +150,7 @@ def fit(
                 w=w,
                 y=y,
                 treatment_variant=treatment_variant,
+                is_oos=False,
             )
 
             treatment_jobs.append(
@@ -205,37 +208,82 @@ def evaluate(
         X: Matrix,
         y: Vector,
         w: Vector,
         is_oos: bool,
         oos_method: OosMethod = OVERALL,
-    ) -> dict[str, float | int]:
-        raise NotImplementedError(
-            "This feature is not yet implemented for the DR-Learner."
+        scoring: Scoring | None = None,
+    ) -> dict[str, float]:
+        safe_scoring = self._scoring(scoring)
+
+        variant_outcome_evaluation = _evaluate_model_kind(
+            cfes=self._nuisance_models[VARIANT_OUTCOME_MODEL],
+            Xs=[X[w == tv] for tv in range(self.n_variants)],
+            ys=[y[w == tv] for tv in range(self.n_variants)],
+            scorers=safe_scoring[VARIANT_OUTCOME_MODEL],
+            model_kind=VARIANT_OUTCOME_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=False,
+        )
+
+        propensity_evaluation = _evaluate_model_kind(
+            cfes=self._nuisance_models[PROPENSITY_MODEL],
+            Xs=[X],
+            ys=[w],
+            scorers=safe_scoring[PROPENSITY_MODEL],
+            model_kind=PROPENSITY_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=False,
+        )
+
+        pseudo_outcome: list[np.ndarray] = []
+        for treatment_variant in range(1, self.n_variants):
+            tv_pseudo_outcome = self._pseudo_outcome(
+                X=X,
+                y=y,
+                w=w,
+                treatment_variant=treatment_variant,
+                is_oos=is_oos,
+                oos_method=oos_method,
+            )
+            pseudo_outcome.append(tv_pseudo_outcome)
+
+        treatment_evaluation = _evaluate_model_kind(
+            self._treatment_models[TREATMENT_MODEL],
+            Xs=[X for _ in range(1, self.n_variants)],
+            ys=pseudo_outcome,
+            scorers=safe_scoring[TREATMENT_MODEL],
+            model_kind=TREATMENT_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=True,
         )
 
+        return variant_outcome_evaluation | propensity_evaluation | treatment_evaluation
+
     def _pseudo_outcome(
         self,
         X: Matrix,
         y: Vector,
         w: Vector,
         treatment_variant: int,
+        is_oos: bool,
+        oos_method: OosMethod = OVERALL,
         epsilon: float = _EPSILON,
     ) -> np.ndarray:
-        """Compute the DR-Learner pseudo outcome.
-
-        Importantly, this method assumes to be applied on in-sample data.
-        In other words, ``is_oos`` will always be set to ``False`` when calling
-        ``predict_nuisance``.
- """ + """Compute the DR-Learner pseudo outcome.""" validate_valid_treatment_variant_not_control(treatment_variant, self.n_variants) conditional_average_outcome_estimates = ( self.predict_conditional_average_outcomes( X=X, - is_oos=False, + is_oos=is_oos, + oos_method=oos_method, ) ) propensity_estimates = self.predict_nuisance( X=X, - is_oos=False, + is_oos=is_oos, + oos_method=oos_method, model_kind=PROPENSITY_MODEL, model_ord=0, ) diff --git a/metalearners/metalearner.py b/metalearners/metalearner.py index c9539fd..4bce635 100644 --- a/metalearners/metalearner.py +++ b/metalearners/metalearner.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: BSD-3-Clause from abc import ABC, abstractmethod -from collections.abc import Callable, Collection +from collections.abc import Callable, Collection, Mapping, Sequence from copy import deepcopy from dataclasses import dataclass from typing import TypedDict @@ -10,6 +10,7 @@ import numpy as np import pandas as pd import shap +from sklearn.metrics import get_scorer from sklearn.model_selection import KFold from typing_extensions import Self @@ -20,11 +21,13 @@ OosMethod, Params, PredictMethod, + Scoring, SplitIndices, Vector, _ScikitModel, ) from metalearners._utils import ( + default_metric, index_matrix, validate_model_and_predict_method, validate_number_positive, @@ -32,6 +35,7 @@ from metalearners.cross_fit_estimator import ( OVERALL, CrossFitEstimator, + _PredictContext, ) from metalearners.explainer import Explainer @@ -133,6 +137,49 @@ def _validate_n_folds_synchronize(n_folds: dict[str, int]) -> None: raise ValueError("Need at least two folds to use synchronization.") +def _evaluate_model_kind( + cfes: Sequence[CrossFitEstimator], + Xs: Sequence[Matrix], + ys: Sequence[Vector], + scorers: Sequence[str | Callable], + model_kind: str, + is_oos: bool, + is_treatment_model: bool, + oos_method: OosMethod = OVERALL, + sample_weights: Sequence[Vector] | None = None, +) -> dict[str, float]: + """Helper function to evaluate all the models of the same model kind.""" + prefix = f"{model_kind}_" + evaluation_metrics: dict[str, float] = {} + for idx, scorer in enumerate(scorers): + if isinstance(scorer, str): + scorer_name = scorer + scorer_callable: Callable = get_scorer(scorer) + else: + scorer_name = f"custom_scorer_{idx}" + scorer_callable = scorer + for i, cfe in enumerate(cfes): + if is_treatment_model: + treatment_variant = i + 1 + index_str = f"{treatment_variant}_vs_0_" + else: + if len(cfes) == 1: + index_str = "" + else: + index_str = f"{i}_" + name = f"{prefix}{index_str}{scorer_name}" + with _PredictContext(cfe, is_oos, oos_method) as modified_cfe: + if sample_weights: + evaluation_metrics[name] = scorer_callable( + modified_cfe, Xs[i], ys[i], sample_weight=sample_weights[i] + ) + else: + evaluation_metrics[name] = scorer_callable( + modified_cfe, Xs[i], ys[i] + ) + return evaluation_metrics + + class _ModelSpecifications(TypedDict): # The quotes on MetaLearner are necessary for type hinting as it's not yet defined # here. Check https://stackoverflow.com/questions/55320236/does-python-evaluate-type-hinting-of-a-forward-reference @@ -809,8 +856,40 @@ def evaluate( w: Vector, is_oos: bool, oos_method: OosMethod = OVERALL, - ) -> dict[str, float | int]: - """Evaluate all models contained in a MetaLearner.""" + scoring: Mapping[str, list[str | Callable]] | None = None, + ) -> dict[str, float]: + r"""Evaluate the MetaLearner. 
+        """
         ...
 
     def explainer(
@@ -940,6 +1019,27 @@ def shap_values(
             shap_explainer_params=shap_explainer_params,
         )
 
+    def _scoring(self, scoring: Scoring | None) -> Scoring:
+
+        def _default_scoring() -> Scoring:
+            return {
+                nuisance_model: [
+                    default_metric(model_specifications["predict_method"](self))
+                ]
+                for nuisance_model, model_specifications in self.nuisance_model_specifications().items()
+            } | {
+                treatment_model: [
+                    default_metric(model_specifications["predict_method"](self))
+                ]
+                for treatment_model, model_specifications in self.treatment_model_specifications().items()
+            }
+
+        default_scoring = _default_scoring()
+
+        if scoring is None:
+            return default_scoring
+        return dict(default_scoring) | dict(scoring)
+
 
 class _ConditionalAverageOutcomeMetaLearner(MetaLearner, ABC):
diff --git a/metalearners/rlearner.py b/metalearners/rlearner.py
index c66dcc9..ee6e45d 100644
--- a/metalearners/rlearner.py
+++ b/metalearners/rlearner.py
@@ -1,14 +1,16 @@
 # Copyright (c) QuantCo 2024-2024
 # SPDX-License-Identifier: BSD-3-Clause
 
+
 import numpy as np
 from joblib import Parallel, delayed
-from sklearn.metrics import log_loss, root_mean_squared_error
+from sklearn.metrics import root_mean_squared_error
 from typing_extensions import Self
 
-from metalearners._typing import Matrix, OosMethod, Vector
+from metalearners._typing import Matrix, OosMethod, Scoring, Vector
 from metalearners._utils import (
     clip_element_absolute_value_to_epsilon,
+    copydoc,
     function_has_argument,
     get_one,
     get_predict,
@@ -24,6 +26,7 @@
     TREATMENT,
     TREATMENT_MODEL,
     MetaLearner,
+    _evaluate_model_kind,
     _fit_cross_fit_estimator_joblib,
     _ModelSpecifications,
     _ParallelJoblibSpecification,
@@ -227,6 +230,7 @@ def fit(
                 treatment_variant=treatment_variant,
                 mask=mask,
                 epsilon=epsilon,
+                is_oos=False,
             )
 
             X_filtered = index_matrix(X, mask)
@@ -323,6 +327,7 @@ def predict(
                 tau_hat[variant_indices, treatment_variant - 1] = variant_estimates
         return tau_hat
 
+    @copydoc(MetaLearner.evaluate, sep="\n\t")
     def evaluate(
         self,
         X: Matrix,
@@ -330,7 +335,37 @@ def evaluate(
         w: Vector,
         is_oos: bool,
         oos_method: OosMethod = OVERALL,
-    ) -> dict[str, float | int]:
+        scoring: Scoring | None = None,
+    ) -> dict[str, float]:
+        """In the RLearner case, the ``"treatment_model"`` is always evaluated with the
+        :func:`~metalearners.rlearner.r_loss` in addition to the scorers in
+        ``scoring["treatment_model"]``, which should support passing the
+        ``sample_weight`` keyword argument."""
+        safe_scoring = self._scoring(scoring)
+
+        propensity_evaluation = _evaluate_model_kind(
+            cfes=self._nuisance_models[PROPENSITY_MODEL],
+            Xs=[X],
+            ys=[w],
+            scorers=safe_scoring[PROPENSITY_MODEL],
+            model_kind=PROPENSITY_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=False,
+        )
+
+        outcome_evaluation = _evaluate_model_kind(
+            cfes=self._nuisance_models[OUTCOME_MODEL],
+            Xs=[X],
+            ys=[y],
+            scorers=safe_scoring[OUTCOME_MODEL],
+            model_kind=OUTCOME_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=False,
+        )
+
+        # TODO: improve this? generalize it to other metalearners?
         w_hat = self.predict_nuisance(
             X=X,
             is_oos=is_oos,
@@ -338,7 +373,6 @@
             model_kind=PROPENSITY_MODEL,
             model_ord=0,
         )
-        propensity_evaluation = {"propensity_cross_entropy": log_loss(w, w_hat)}
 
         y_hat = self.predict_nuisance(
             X=X,
@@ -350,13 +384,39 @@
         if self.is_classification:
             y_hat = y_hat[:, 1]
 
-        outcome_evaluation = (
-            {"outcome_log_loss": log_loss(y, y_hat)}
-            if self.is_classification
-            else {"outcome_rmse": root_mean_squared_error(y, y_hat)}
+        pseudo_outcome: list[np.ndarray] = []
+        sample_weights: list[np.ndarray] = []
+        masks: list[Vector] = []
+        is_control = w == 0
+        for treatment_variant in range(1, self.n_variants):
+            is_treatment = w == treatment_variant
+            mask = is_treatment | is_control
+            tv_pseudo_outcome, tv_sample_weights = self._pseudo_outcome_and_weights(
+                X=X,
+                y=y,
+                w=w,
+                treatment_variant=treatment_variant,
+                is_oos=is_oos,
+                oos_method=oos_method,
+                mask=mask,
+            )
+            pseudo_outcome.append(tv_pseudo_outcome)
+            sample_weights.append(tv_sample_weights)
+            masks.append(mask)
+
+        treatment_evaluation = _evaluate_model_kind(
+            self._treatment_models[TREATMENT_MODEL],
+            Xs=[X[masks[tv - 1]] for tv in range(1, self.n_variants)],
+            ys=pseudo_outcome,
+            scorers=safe_scoring[TREATMENT_MODEL],
+            model_kind=TREATMENT_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=True,
+            sample_weights=sample_weights,
         )
 
-        treatment_evaluation = {}
+        rloss_evaluation = {}
         tau_hat = self.predict(X=X, is_oos=is_oos, oos_method=oos_method)
         is_control = w == 0
         for treatment_variant in range(1, self.n_variants):
@@ -371,15 +431,19 @@
                 if self.is_classification
                 else tau_hat[:, treatment_variant - 1, 0]
             )
-            treatment_evaluation[f"r_loss_{treatment_variant}_vs_0"] = r_loss(
+            rloss_evaluation[f"r_loss_{treatment_variant}_vs_0"] = r_loss(
                 cate_estimates=cate_estimates[mask],
                 outcome_estimates=y_hat[mask],
                 propensity_scores=propensity_estimates[mask],
                 outcomes=y[mask],
                 treatments=w[mask] == treatment_variant,
             )
-
-        return propensity_evaluation | outcome_evaluation | treatment_evaluation
+        return (
+            propensity_evaluation
+            | outcome_evaluation
+            | rloss_evaluation
+            | treatment_evaluation
+        )
 
     def _pseudo_outcome_and_weights(
         self,
@@ -387,15 +451,13 @@
         y: Vector,
         w: Vector,
         treatment_variant: int,
+        is_oos: bool,
+        oos_method: OosMethod = OVERALL,
         mask: Vector | None = None,
         epsilon: float = _EPSILON,
     ) -> tuple[np.ndarray, np.ndarray]:
         """Compute the R-Learner pseudo outcome and corresponding weights.
 
-        Importantly, this method assumes to be applied on in-sample data.
-        In other words, ``is_oos`` will always be set to ``False`` when calling
-        ``predict_nuisance``.
-
         If ``mask`` is provided, the retuned pseudo outcomes and weights are only with
         respect the observations that the mask selects.
@@ -411,12 +473,17 @@
         # be able to match original observations with their corresponding folds.
         y_estimates = self.predict_nuisance(
             X=X,
-            is_oos=False,
+            is_oos=is_oos,
             model_kind=OUTCOME_MODEL,
             model_ord=0,
+            oos_method=oos_method,
         )[mask]
         w_estimates = self.predict_nuisance(
-            X=X, is_oos=False, model_kind=PROPENSITY_MODEL, model_ord=0
+            X=X,
+            is_oos=is_oos,
+            model_kind=PROPENSITY_MODEL,
+            model_ord=0,
+            oos_method=oos_method,
         )[mask]
         w_estimates_binarized = w_estimates[:, treatment_variant] / (
             w_estimates[:, 0] + w_estimates[:, treatment_variant]
diff --git a/metalearners/slearner.py b/metalearners/slearner.py
index 9d42522..9d49e20 100644
--- a/metalearners/slearner.py
+++ b/metalearners/slearner.py
@@ -5,7 +5,6 @@
 
 import numpy as np
 import pandas as pd
-from sklearn.metrics import log_loss, root_mean_squared_error
 from typing_extensions import Self
 
 from metalearners._typing import (
@@ -14,6 +13,7 @@
     ModelFactory,
     OosMethod,
     Params,
+    Scoring,
     Vector,
     _ScikitModel,
 )
@@ -23,7 +23,12 @@
     supports_categoricals,
 )
 from metalearners.cross_fit_estimator import OVERALL, CrossFitEstimator
-from metalearners.metalearner import NUISANCE, MetaLearner, _ModelSpecifications
+from metalearners.metalearner import (
+    NUISANCE,
+    MetaLearner,
+    _evaluate_model_kind,
+    _ModelSpecifications,
+)
 
 _BASE_MODEL = "base_model"
 
@@ -191,17 +196,23 @@ def evaluate(
         w: Vector,
         is_oos: bool,
         oos_method: OosMethod = OVERALL,
-    ) -> dict[str, float | int]:
-        # TODO: Parameterize evaluation approaches.
+        scoring: Scoring | None = None,
+    ) -> dict[str, float]:
+        safe_scoring = self._scoring(scoring)
+
         X_with_w = _append_treatment_to_covariates(
             X, w, self._supports_categoricals, self.n_variants
         )
-        y_pred = self.predict_nuisance(
-            X=X_with_w, model_kind=_BASE_MODEL, model_ord=0, is_oos=is_oos
+        return _evaluate_model_kind(
+            cfes=self._nuisance_models[_BASE_MODEL],
+            Xs=[X_with_w],
+            ys=[y],
+            scorers=safe_scoring[_BASE_MODEL],
+            model_kind=_BASE_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=False,
         )
-        if self.is_classification:
-            return {"cross_entropy": log_loss(y, y_pred)}
-        return {"rmse": root_mean_squared_error(y, y_pred)}
 
     def predict_conditional_average_outcomes(
         self, X: Matrix, is_oos: bool, oos_method: OosMethod = OVERALL
diff --git a/metalearners/tlearner.py b/metalearners/tlearner.py
index f4673c6..81f4ae4 100644
--- a/metalearners/tlearner.py
+++ b/metalearners/tlearner.py
@@ -3,10 +3,9 @@
 
 import numpy as np
 from joblib import Parallel, delayed
-from sklearn.metrics import log_loss, root_mean_squared_error
 from typing_extensions import Self
 
-from metalearners._typing import Matrix, OosMethod, Vector
+from metalearners._typing import Matrix, OosMethod, Scoring, Vector
 from metalearners._utils import index_matrix
 from metalearners.cross_fit_estimator import OVERALL
 from metalearners.metalearner import (
@@ -14,6 +13,7 @@
     VARIANT_OUTCOME_MODEL,
     MetaLearner,
     _ConditionalAverageOutcomeMetaLearner,
+    _evaluate_model_kind,
     _fit_cross_fit_estimator_joblib,
     _ModelSpecifications,
     _ParallelJoblibSpecification,
@@ -113,21 +113,17 @@ def evaluate(
         w: Vector,
         is_oos: bool,
         oos_method: OosMethod = OVERALL,
-    ) -> dict[str, float | int]:
-        # TODO: Parametrize evaluation approaches.
-        conditional_average_outcomes = self.predict_conditional_average_outcomes(
-            X=X, is_oos=is_oos, oos_method=oos_method
+        scoring: Scoring | None = None,
+    ) -> dict[str, float]:
+        safe_scoring = self._scoring(scoring)
+
+        return _evaluate_model_kind(
+            cfes=self._nuisance_models[VARIANT_OUTCOME_MODEL],
+            Xs=[X[w == tv] for tv in range(self.n_variants)],
+            ys=[y[w == tv] for tv in range(self.n_variants)],
+            scorers=safe_scoring[VARIANT_OUTCOME_MODEL],
+            model_kind=VARIANT_OUTCOME_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=False,
         )
-        evaluation_metrics = {}
-        for treatment_variant in range(self.n_variants):
-            prefix = f"variant_{treatment_variant}"
-            variant_outcomes = conditional_average_outcomes[:, treatment_variant]
-            if self.is_classification:
-                evaluation_metrics[f"{prefix}_cross_entropy"] = log_loss(
-                    y[w == treatment_variant], variant_outcomes[w == treatment_variant]
-                )
-            else:
-                evaluation_metrics[f"{prefix}_rmse"] = root_mean_squared_error(
-                    y[w == treatment_variant], variant_outcomes[w == treatment_variant]
-                )
-        return evaluation_metrics
diff --git a/metalearners/xlearner.py b/metalearners/xlearner.py
index ab62926..1ef2539 100644
--- a/metalearners/xlearner.py
+++ b/metalearners/xlearner.py
@@ -1,11 +1,12 @@
 # Copyright (c) QuantCo 2024-2024
 # SPDX-License-Identifier: BSD-3-Clause
 
+
 import numpy as np
 from joblib import Parallel, delayed
 from typing_extensions import Self
 
-from metalearners._typing import Matrix, OosMethod, Vector
+from metalearners._typing import Matrix, OosMethod, Scoring, Vector
 from metalearners._utils import (
     get_one,
     get_predict,
@@ -21,6 +22,7 @@
     VARIANT_OUTCOME_MODEL,
     MetaLearner,
     _ConditionalAverageOutcomeMetaLearner,
+    _evaluate_model_kind,
    _fit_cross_fit_estimator_joblib,
     _ModelSpecifications,
     _ParallelJoblibSpecification,
@@ -285,9 +287,68 @@ def evaluate(
         w: Vector,
         is_oos: bool,
         oos_method: OosMethod = OVERALL,
-    ) -> dict[str, float | int]:
-        raise NotImplementedError(
-            "This feature is not yet implemented for the X-Learner."
+        scoring: Scoring | None = None,
+    ) -> dict[str, float]:
+        safe_scoring = self._scoring(scoring)
+
+        variant_outcome_evaluation = _evaluate_model_kind(
+            cfes=self._nuisance_models[VARIANT_OUTCOME_MODEL],
+            Xs=[X[w == tv] for tv in range(self.n_variants)],
+            ys=[y[w == tv] for tv in range(self.n_variants)],
+            scorers=safe_scoring[VARIANT_OUTCOME_MODEL],
+            model_kind=VARIANT_OUTCOME_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=False,
+        )
+
+        propensity_evaluation = _evaluate_model_kind(
+            cfes=self._nuisance_models[PROPENSITY_MODEL],
+            Xs=[X],
+            ys=[w],
+            scorers=safe_scoring[PROPENSITY_MODEL],
+            model_kind=PROPENSITY_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=False,
+        )
+
+        imputed_te_control: list[np.ndarray] = []
+        imputed_te_treatment: list[np.ndarray] = []
+        for treatment_variant in range(1, self.n_variants):
+            tv_imputed_te_control, tv_imputed_te_treatment = self._pseudo_outcome(
+                X, y, w, treatment_variant
+            )
+            imputed_te_control.append(tv_imputed_te_control)
+            imputed_te_treatment.append(tv_imputed_te_treatment)
+
+        te_treatment_evaluation = _evaluate_model_kind(
+            self._treatment_models[TREATMENT_EFFECT_MODEL],
+            Xs=[X[w == tv] for tv in range(1, self.n_variants)],
+            ys=imputed_te_treatment,
+            scorers=safe_scoring[TREATMENT_EFFECT_MODEL],
+            model_kind=TREATMENT_EFFECT_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=True,
+        )
+
+        te_control_evaluation = _evaluate_model_kind(
+            self._treatment_models[CONTROL_EFFECT_MODEL],
+            Xs=[X[w == 0] for _ in range(1, self.n_variants)],
+            ys=imputed_te_control,
+            scorers=safe_scoring[CONTROL_EFFECT_MODEL],
+            model_kind=CONTROL_EFFECT_MODEL,
+            is_oos=is_oos,
+            oos_method=oos_method,
+            is_treatment_model=True,
+        )
+
+        return (
+            variant_outcome_evaluation
+            | propensity_evaluation
+            | te_treatment_evaluation
+            | te_control_evaluation
         )
 
     def _pseudo_outcome(
diff --git a/tests/test_learner.py b/tests/test_learner.py
index efe36c7..f001eda 100644
--- a/tests/test_learner.py
+++ b/tests/test_learner.py
@@ -5,7 +5,7 @@
 import pytest
 from lightgbm import LGBMClassifier, LGBMRegressor
 from sklearn.linear_model import LinearRegression, LogisticRegression
-from sklearn.metrics import root_mean_squared_error
+from sklearn.metrics import make_scorer, root_mean_squared_error
 from sklearn.model_selection import train_test_split
 
 from metalearners.cross_fit_estimator import _OOS_WHITELIST
@@ -309,11 +309,14 @@ def test_learner_twins(metalearner, reference_value, twins_data, rng):
     assert rmse < reference_value * (1 + _OOS_REFERENCE_VALUE_TOLERANCE)
 
 
-@pytest.mark.parametrize("metalearner", ["S", "T", "R"])
+@pytest.mark.parametrize("metalearner", ["S", "T", "X", "R", "DR"])
 @pytest.mark.parametrize("n_classes", [2, 5, 10])
 @pytest.mark.parametrize("n_variants", [2, 5])
 @pytest.mark.parametrize("is_classification", [True, False])
-def test_learner_evaluate(metalearner, is_classification, rng, n_classes, n_variants):
+@pytest.mark.parametrize("is_oos", [True, False])
+def test_learner_evaluate(
+    metalearner, is_classification, rng, n_classes, n_variants, is_oos
+):
     sample_size = 1000
     factory = metalearner_factory(metalearner)
     if n_variants > 2 and not factory._supports_multi_treatment():
@@ -322,12 +325,17 @@ def test_learner_evaluate(metalearner, is_classification, rng, n_classes, n_vari
         pytest.skip()  # skip repeated tests
     if is_classification and n_classes > 2 and not factory._supports_multi_class():
         pytest.skip()
+    test_size = 250
     X = rng.standard_normal((sample_size, 10))
+    X_test = rng.standard_normal((test_size, 10)) if is_oos else X
     w = rng.integers(0, n_variants, size=sample_size)
+    w_test = rng.integers(0, n_variants, test_size) if is_oos else w
     if is_classification:
         y = rng.integers(0, n_classes, size=sample_size)
+        y_test = rng.integers(0, n_classes, test_size) if is_oos else y
     else:
         y = rng.standard_normal(sample_size)
+        y_test = rng.standard_normal(test_size) if is_oos else y
 
     base_learner = _linear_base_learner(is_classification)
@@ -340,28 +348,173 @@
         n_folds=2,
     )
     learner.fit(X=X, y=y, w=w)
-    evaluation = learner.evaluate(X=X, y=y, w=w, is_oos=False)
+    evaluation = learner.evaluate(X=X_test, y=y_test, w=w_test, is_oos=is_oos)
     if is_classification:
         if metalearner == "S":
-            assert "cross_entropy" in evaluation
-        elif metalearner == "T":
+            assert set(evaluation.keys()) == {"base_model_neg_log_loss"}
+        elif metalearner in ["T", "X", "DR"]:
             for v in range(n_variants):
-                assert f"variant_{v}_cross_entropy" in evaluation
+                assert f"variant_outcome_model_{v}_neg_log_loss" in evaluation
         elif metalearner == "R":
-            assert "outcome_log_loss" in evaluation
+            assert "outcome_model_neg_log_loss" in evaluation
     else:
         if metalearner == "S":
-            assert "rmse" in evaluation
-        elif metalearner == "T":
+            assert set(evaluation.keys()) == {"base_model_neg_root_mean_squared_error"}
+        elif metalearner in ["T", "X", "DR"]:
             for v in range(n_variants):
-                assert f"variant_{v}_rmse" in evaluation
+                assert (
+                    f"variant_outcome_model_{v}_neg_root_mean_squared_error"
+                    in evaluation
+                )
         elif metalearner == "R":
-            assert "outcome_rmse" in evaluation
+            assert "outcome_model_neg_root_mean_squared_error" in evaluation
 
     if metalearner == "R":
         assert (
             {f"r_loss_{i}_vs_0" for i in range(1, n_variants)}
-            | {"propensity_cross_entropy"}
+            | {"propensity_model_neg_log_loss"}
+            | {
+                f"treatment_model_{i}_vs_0_neg_root_mean_squared_error"
+                for i in range(1, n_variants)
+            }
         ) <= set(evaluation.keys())
+    elif metalearner == "X":
+        assert "propensity_model_neg_log_loss" in evaluation
+        for v in range(1, n_variants):
+            assert (
+                f"treatment_effect_model_{v}_vs_0_neg_root_mean_squared_error"
+                in evaluation
+            )
+            assert (
+                f"control_effect_model_{v}_vs_0_neg_root_mean_squared_error"
+                in evaluation
+            )
+    elif metalearner == "DR":
+        assert "propensity_model_neg_log_loss" in evaluation
+        for v in range(1, n_variants):
+            assert f"treatment_model_{v}_vs_0_neg_root_mean_squared_error" in evaluation
+
+
+def new_score(estimator, X, y):
+    # This score doesn't make sense.
+    return np.mean(y - estimator.predict(X))
+
+
+def new_score_2(y, y_pred):
+    # This score doesn't make sense.
+    return np.mean(y - y_pred)
+
+
+@pytest.mark.parametrize(
+    "metalearner, is_classification, scoring, expected_keys",
+    [
+        ("S", True, {"base_model": ["accuracy"]}, {"base_model_accuracy"}),
+        ("S", False, {"base_model": ["max_error"]}, {"base_model_max_error"}),
+        (
+            "T",
+            False,
+            {
+                "variant_outcome_model": [new_score, make_scorer(new_score_2)],
+                "to_ignore": [],
+            },
+            {
+                "variant_outcome_model_0_custom_scorer_0",
+                "variant_outcome_model_0_custom_scorer_1",
+                "variant_outcome_model_1_custom_scorer_0",
+                "variant_outcome_model_1_custom_scorer_1",
+                "variant_outcome_model_2_custom_scorer_0",
+                "variant_outcome_model_2_custom_scorer_1",
+            },
+        ),
+        (
+            "X",
+            True,
+            {
+                "variant_outcome_model": ["f1"],
+                "propensity_model": [],
+                "control_effect_model": [],
+                "treatment_effect_model": ["r2", new_score],
+            },
+            {
+                "variant_outcome_model_0_f1",
+                "variant_outcome_model_1_f1",
+                "variant_outcome_model_2_f1",
+                "treatment_effect_model_1_vs_0_r2",
+                "treatment_effect_model_1_vs_0_custom_scorer_1",
+                "treatment_effect_model_2_vs_0_r2",
+                "treatment_effect_model_2_vs_0_custom_scorer_1",
+            },
+        ),
+        (
+            "R",
+            False,
+            {
+                "outcome_model": [make_scorer(new_score_2)],
+                "propensity_model": [],
+                "treatment_model": ["neg_mean_absolute_error"],
+            },
+            {
+                "outcome_model_custom_scorer_0",
+                "r_loss_1_vs_0",
+                "r_loss_2_vs_0",
+                "treatment_model_1_vs_0_neg_mean_absolute_error",
+                "treatment_model_2_vs_0_neg_mean_absolute_error",
+            },
+        ),
+        (
+            "DR",
+            True,
+            {
+                "variant_outcome_model": ["f1"],
+                "propensity_model": [],
+                "treatment_model": ["r2", new_score],
+            },
+            {
+                "variant_outcome_model_0_f1",
+                "variant_outcome_model_1_f1",
+                "variant_outcome_model_2_f1",
+                "treatment_model_1_vs_0_r2",
+                "treatment_model_1_vs_0_custom_scorer_1",
+                "treatment_model_2_vs_0_r2",
+                "treatment_model_2_vs_0_custom_scorer_1",
+            },
+        ),
+    ],
+)
+@pytest.mark.parametrize("is_oos", [True, False])
+def test_learner_evaluate_scoring(
+    metalearner, is_classification, scoring, expected_keys, is_oos, rng
+):
+    factory = metalearner_factory(metalearner)
+    nuisance_model_factory = _linear_base_learner(is_classification)
+    nuisance_model_params = _linear_base_learner_params(is_classification)
+
+    n_variants = 3
+    sample_size = 1000
+    test_size = 250
+    X = rng.standard_normal((sample_size, 10))
+    X_test = rng.standard_normal((test_size, 10)) if is_oos else X
+    w = rng.integers(0, n_variants, size=sample_size)
+    w_test = rng.integers(0, n_variants, test_size) if is_oos else w
+    if is_classification:
+        y = rng.integers(0, 2, size=sample_size)
+        y_test = rng.integers(0, 2, test_size) if is_oos else y
+    else:
+        y = rng.standard_normal(sample_size)
+        y_test = rng.standard_normal(test_size) if is_oos else y
+
+    ml = factory(
+        is_classification=is_classification,
+        n_variants=n_variants,
+        nuisance_model_factory=nuisance_model_factory,
+        propensity_model_factory=LGBMClassifier,
+        treatment_model_factory=LinearRegression,
+        nuisance_model_params=nuisance_model_params,
+        propensity_model_params={"n_estimators": 1},
+        n_folds=2,
+    )
+    ml.fit(X, y, w)
+    evaluation = ml.evaluate(X_test, y_test, w_test, is_oos, scoring=scoring)
+    assert set(evaluation.keys()) == expected_keys
 
 
 @pytest.mark.parametrize("outcome_kind", ["binary", "continuous"])