From 85324a63f1a454397d31a5e91cf5c2bb5b646556 Mon Sep 17 00:00:00 2001
From: kgao
Date: Tue, 11 Apr 2023 15:12:48 -0400
Subject: [PATCH] Init

Signed-off-by: kgao
---
 econml/score/drscorer.py      | 276 ++++++++++++++++++++++++++++++++++
 econml/tests/test_drscorer.py |  88 +++++++++++
 2 files changed, 364 insertions(+)
 create mode 100644 econml/score/drscorer.py
 create mode 100644 econml/tests/test_drscorer.py

diff --git a/econml/score/drscorer.py b/econml/score/drscorer.py
new file mode 100644
index 000000000..a3c1a9876
--- /dev/null
+++ b/econml/score/drscorer.py
@@ -0,0 +1,276 @@
+# Copyright (c) PyWhy contributors. All rights reserved.
+# Licensed under the MIT License.
+
+from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
+from ..dr import DRLearner
+from sklearn.base import clone
+import numpy as np
+from scipy.special import softmax
+from .ensemble_cate import EnsembleCateEstimator
+
+
+class DRScorer:
+    """ Scorer based on the DRLearner loss. Fits a regression model g and a propensity model p at fit time
+    and uses them to construct the doubly robust outcomes on the evaluation data::
+
+        g (model_regression) = E[Y | X, W, T]
+
+        p (model_propensity) = Pr[T | X, W]
+
+        Ydr(g, p) = g + (Y - g) / p * T
+
+    Then for any given cate model it calculates the loss::
+
+        loss(cate) = E_n[(Ydr(g, p) - cate(X))^2]
+
+    It also calculates a baseline loss based on a constant treatment effect model, i.e.::
+
+        base_loss = min_{theta} E_n[(Ydr(g, p) - theta)^2]
+
+    and returns an analogue of the R-square score for regression::
+
+        score = 1 - loss(cate) / base_loss
+
+    This corresponds to the extra variance of the outcome explained by introducing heterogeneity
+    in the effect as captured by the cate model, as opposed to always predicting a constant effect.
+    A negative score means that the cate model performs even worse than a constant effect model
+    and hints at overfitting during training of the cate model.
+
+    This method was also advocated in recent work of [Schuleretal2018]_ when compared among several
+    alternatives for causal model selection, and was introduced in the work of [NieWager2017]_.
+
+    Parameters
+    ----------
+    model_propensity : scikit-learn classifier or 'auto', default 'auto'
+        Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated.
+        Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T,
+        where T is a shape (n, ) array.
+        If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be chosen.
+
+    model_regression : scikit-learn regressor or 'auto', default 'auto'
+        Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments)
+        concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and
+        `predict` methods. If different models per treatment arm are desired, see the
+        :class:`.MultiModelWrapper` helper class.
+        If 'auto', :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be chosen.
+
+    model_final :
+        Estimator for the final cate model. Trained by regressing the doubly robust potential outcomes
+        on (features X).
+
+        - If X is None, then the fit method of model_final should be able to handle X=None.
+        - If featurizer is not None and X is not None, then it is trained on the outcome of
+          featurizer.fit_transform(X).
+        - If multitask_model_final is True, then this model must support multitasking and it is trained by
+          regressing all doubly robust target outcomes on (featurized) features simultaneously.
+        - The output of predict(X) of the trained model will contain the CATEs for each treatment compared to
+          the baseline treatment (lexicographically smallest). If multitask_model_final is False, it is assumed
+          to be a mono-task model and a separate clone of the model is trained for each outcome. Then predict(X)
+          of the t-th clone will be the CATE of the t-th lexicographically ordered treatment compared to the
+          baseline.
+
+    multitask_model_final : bool, default False
+        Whether the model_final should be treated as a multi-task model. See description of model_final.
+
+    featurizer : :term:`transformer`, optional
+        Must support fit_transform and transform. Used to create composite features in the final CATE regression.
+        It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
+        If featurizer=None, then CATE is trained on X.
+
+    min_propensity : float, default ``1e-6``
+        The minimum propensity at which to clip propensity estimates to avoid dividing by zero.
+
+    categories : 'auto' or list, default 'auto'
+        The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
+        The first category will be treated as the control treatment.
+
+    cv : int, cross-validation generator or an iterable, default 2
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None, to use the default 3-fold cross-validation,
+        - integer, to specify the number of folds,
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For integer/None inputs, if the treatment is discrete
+        :class:`~sklearn.model_selection.StratifiedKFold` is used, else
+        :class:`~sklearn.model_selection.KFold` is used
+        (with a random shuffle in either case).
+
+        Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
+        W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
+
+    mc_iters : int, optional
+        The number of times to rerun the first stage models to reduce the variance of the nuisances.
+
+    mc_agg : {'mean', 'median'}, default 'mean'
+        How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
+        cross-fitting.
+
+    random_state : int, :class:`~numpy.random.mtrand.RandomState` instance or None
+        If int, random_state is the seed used by the random number generator;
+        If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
+        If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
+        by :mod:`np.random`.
+
+    References
+    ----------
+    .. [NieWager2017] X. Nie and S. Wager.
+        Quasi-Oracle Estimation of Heterogeneous Treatment Effects.
+        arXiv preprint arXiv:1712.04912, 2017.
+        `<https://arxiv.org/abs/1712.04912>`_
+
+    .. [Schuleretal2018] Alejandro Schuler, Michael Baiocchi, Robert Tibshirani, Nigam Shah.
+        "A comparison of methods for model selection when estimating individual treatment effects."
+        arXiv, 2018.
+
+    """
+
+    def __init__(self, *,
+                 model_propensity='auto',
+                 model_regression='auto',
+                 model_final=StatsModelsLinearRegression(),
+                 multitask_model_final=False,
+                 featurizer=None,
+                 min_propensity=1e-6,
+                 categories='auto',
+                 cv=2,
+                 mc_iters=None,
+                 mc_agg='mean',
+                 random_state=None):
+        self.model_propensity = clone(model_propensity, safe=False)
+        self.model_regression = clone(model_regression, safe=False)
+        self.model_final = clone(model_final, safe=False)
+        self.multitask_model_final = multitask_model_final
+        self.featurizer = clone(featurizer, safe=False)
+        self.min_propensity = min_propensity
+        self.categories = categories
+        self.cv = cv
+        self.mc_iters = mc_iters
+        self.mc_agg = mc_agg
+        self.random_state = random_state
+
+    def fit(self, y, T, X=None, W=None, sample_weight=None, groups=None):
+        """
+        Fits the DRLearner nuisance and baseline models on the evaluation data.
+
+        Parameters
+        ----------
+        y : (n, d_y) matrix or vector of length n
+            Outcomes for each sample
+        T : (n, d_t) matrix or vector of length n
+            Treatments for each sample
+        X : (n, d_x) matrix
+            Features for each sample. Required for the DRScorer.
+        W : (n, d_w) matrix, optional
+            Controls for each sample
+        sample_weight : (n,) vector, optional
+            Weights for each row
+        groups : (n,) vector, optional
+            All rows corresponding to the same group will be kept together during splitting.
+            If groups is not None, the `cv` argument passed to this class's initializer
+            must support a 'groups' argument to its split method.
+
+        Returns
+        -------
+        self
+        """
+        if X is None:
+            raise ValueError("X cannot be None for the DRScorer!")
+
+        self.drlearner_ = DRLearner(model_propensity=self.model_propensity,
+                                    model_regression=self.model_regression,
+                                    model_final=self.model_final,
+                                    multitask_model_final=self.multitask_model_final,
+                                    featurizer=self.featurizer,
+                                    min_propensity=self.min_propensity,
+                                    categories=self.categories,
+                                    cv=self.cv,
+                                    mc_iters=self.mc_iters,
+                                    mc_agg=self.mc_agg,
+                                    random_state=self.random_state)
+        # Fit with X=None and fold the features into W, so that the DRLearner's final model
+        # is a constant effect model; its score_ then serves as the baseline loss.
+        self.drlearner_.fit(y, T, X=None, W=np.hstack([v for v in [X, W] if v is not None]),
+                            sample_weight=sample_weight, groups=groups, cache_values=True)
+        self.base_score_ = self.drlearner_.score_
+        self.dx_ = X.shape[1]
+        return self
+
+    def score(self, cate_model):
+        """
+        Parameters
+        ----------
+        cate_model : instance of fitted BaseCateEstimator
+
+        Returns
+        -------
+        score : double
+            An analogue of the R-square score for the causal setting, based on the doubly robust loss.
+        """
+        # Doubly robust potential outcomes cached by the DRLearner, converted into effect
+        # targets for each treatment compared to the baseline (lexicographically smallest) one.
+        Y_pred = self.drlearner_._cached_values.nuisances[0]
+        Ydr = Y_pred[..., 1:] - Y_pred[..., [0]]
+        X = self.drlearner_._cached_values.W[:, :self.dx_]
+        sample_weight = self.drlearner_._cached_values.sample_weight
+        if Ydr.ndim == 1:
+            Ydr = Ydr.reshape((-1, 1))
+
+        cate = cate_model.const_marginal_effect(X).reshape((-1, Ydr.shape[1]))
+
+        if sample_weight is not None:
+            return 1 - np.mean(np.average((Ydr - cate)**2, weights=sample_weight, axis=0)) / self.base_score_
+        else:
+            return 1 - np.mean((Ydr - cate)**2) / self.base_score_
+
+    def best_model(self, cate_models, return_scores=False):
+        """ Chooses the best among a list of models.
+
+        Parameters
+        ----------
+        cate_models : list of instances of fitted BaseCateEstimator
+        return_scores : bool, default False
+            Whether to return the list of scores of each model
+
+        Returns
+        -------
+        best_model : instance of fitted BaseCateEstimator
+            The model that achieves the best score
+        best_score : double
+            The score of the best model
+        scores : list of double
+            The list of scores for each of the input models. Returned only if `return_scores=True`.
+ """ + rscores = [self.score(mdl) for mdl in cate_models] + best = np.nanargmax(rscores) + if return_scores: + return cate_models[best], rscores[best], rscores + else: + return cate_models[best], rscores[best] + + def ensemble(self, cate_models, eta=1000.0, return_scores=False): + """ Ensembles a list of models based on their performance + + Parameters + ---------- + cate_models : list of instance of fitted BaseCateEstimator + eta : double, default 1000 + The soft-max parameter for the ensemble + return_scores : bool, default False + Whether to return the list scores of each model + Returns + ------- + ensemble_model : instance of fitted EnsembleCateEstimator + A fitted ensemble cate model that calculates effects based on a weighted + version of the input cate models, weighted by a softmax of their score + performance + ensemble_score : double + The score of the ensemble model + scores : list of double + The list of scores for each of the input models. Returned only if `return_scores=True`. + """ + drscores = np.array([self.score(mdl) for mdl in cate_models]) + goodinds = np.isfinite(drscores) + weights = softmax(eta * drscores[goodinds]) + goodmodels = [mdl for mdl, good in zip(cate_models, goodinds) if good] + ensemble = EnsembleCateEstimator(cate_models=goodmodels, weights=weights) + ensemble_score = self.score(ensemble) + if return_scores: + return ensemble, ensemble_score, drscores + else: + return ensemble, ensemble_score diff --git a/econml/tests/test_drscorer.py b/econml/tests/test_drscorer.py new file mode 100644 index 000000000..b407a0758 --- /dev/null +++ b/econml/tests/test_drscorer.py @@ -0,0 +1,88 @@ +# Copyright (c) PyWhy contributors. All rights reserved. +# Licensed under the MIT License. + +import unittest +import numpy as np + +from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression +np.set_printoptions(suppress=True) +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression, LogisticRegression +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from joblib import Parallel, delayed + +from econml.dml import DML, LinearDML, SparseLinearDML, NonParamDML +from econml.metalearners import XLearner, TLearner, SLearner, DomainAdaptationLearner +from econml.dr import DRLearner +from econml.score import DRScorer +import scipy.special + + +def _fit_model(name, model, Y, T, X): + return name, model.fit(Y, T, X=X) + + +class TestDRScorer(unittest.TestCase): + + def _get_data(self): + X = np.random.normal(size=(1000, 3)) + T = np.random.binomial(2, scipy.special.expit(X[:, 0])) + sigma = 0.001 + y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,)) + return y, T, X, X[:, 0] + + + def test_comparison(self): + def reg(): + return LinearRegression() + + def clf(): + return LogisticRegression() + + y, T, X, true_eff = self._get_data() + (X_train, X_val, T_train, T_val, + Y_train, Y_val, _, true_eff_val) = train_test_split(X, T, y, true_eff, test_size=.4) + + models = [('ldml', LinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True, + linear_first_stages=False, cv=3)), + ('sldml', SparseLinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True, + featurizer=PolynomialFeatures(degree=2, include_bias=False), + linear_first_stages=False, cv=3)), + ('xlearner', XLearner(models=reg(), cate_models=reg(), propensity_model=clf())), + ('dalearner', DomainAdaptationLearner(models=reg(), final_models=reg(), propensity_model=clf())), + ('slearner', 
+                   SLearner(overall_model=reg())),
+                  ('tlearner', TLearner(models=reg())),
+                  ('drlearner', DRLearner(model_propensity='auto', model_regression='auto',
+                                          model_final=reg(), cv=3)),
+                  ('rlearner', NonParamDML(model_y=reg(), model_t=clf(), model_final=reg(),
+                                           discrete_treatment=True, cv=3)),
+                  ('dml3dlasso', DML(model_y=reg(), model_t=clf(), model_final=reg(), discrete_treatment=True,
+                                     featurizer=PolynomialFeatures(degree=3),
+                                     linear_first_stages=False, cv=3))
+                  ]
+
+        models = Parallel(n_jobs=1, verbose=1)(delayed(_fit_model)(name, mdl,
+                                                                   Y_train, T_train, X_train)
+                                                for name, mdl in models)
+
+        scorer = DRScorer(model_propensity='auto',
+                          model_regression='auto',
+                          model_final=StatsModelsLinearRegression(),
+                          multitask_model_final=False,
+                          featurizer=None,
+                          min_propensity=1e-6,
+                          cv=3,
+                          mc_iters=2,
+                          mc_agg='median')
+        scorer.fit(Y_val, T_val, X=X_val)
+        rscore = [scorer.score(mdl) for _, mdl in models]
+        rootpehe_score = [np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2))
+                          for _, mdl in models]
+        # The slope of root-PEHE regressed on the DR score should stay small: a better score
+        # should not come with a substantially worse root-PEHE.
+        assert LinearRegression().fit(np.array(rscore).reshape(-1, 1), np.array(rootpehe_score)).coef_ < 0.5
+        mdl, _ = scorer.best_model([mdl for _, mdl in models])
+        rootpehe_best = np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2))
+        assert rootpehe_best < 1.5 * np.min(rootpehe_score) + 0.05
+        mdl, _ = scorer.ensemble([mdl for _, mdl in models])
+        rootpehe_ensemble = np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2))
+        assert rootpehe_ensemble < 1.5 * np.min(rootpehe_score) + 0.05
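
A minimal usage sketch of the new DRScorer on synthetic data, assuming this patch has been applied. The data-generating process, the two candidate estimators, and the hyperparameters below are illustrative only and simply mirror the included test; they are not a prescribed workflow.

import numpy as np
import scipy.special
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

from econml.dml import LinearDML
from econml.metalearners import TLearner
from econml.score import DRScorer

# Synthetic data: binary treatment with a heterogeneous effect (1 + 0.5 * X0).
X = np.random.normal(size=(2000, 3))
T = np.random.binomial(1, scipy.special.expit(X[:, 0]))
y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, 0.1, size=2000)
X_tr, X_val, T_tr, T_val, y_tr, y_val = train_test_split(X, T, y, test_size=0.4)

# Fit a couple of candidate CATE models on the training split.
candidates = [
    ('ldml', LinearDML(model_y=LinearRegression(), model_t=LogisticRegression(),
                       discrete_treatment=True, cv=3).fit(y_tr, T_tr, X=X_tr)),
    ('tlearner', TLearner(models=LinearRegression()).fit(y_tr, T_tr, X=X_tr)),
]

# Fit the scorer's nuisance models on the held-out split and score each candidate.
scorer = DRScorer(model_propensity='auto', model_regression='auto', cv=3)
scorer.fit(y_val, T_val, X=X_val)
for name, mdl in candidates:
    print(name, scorer.score(mdl))

# Select the highest-scoring candidate, or softmax-ensemble all of them.
best, best_score = scorer.best_model([mdl for _, mdl in candidates])
ens, ens_score = scorer.ensemble([mdl for _, mdl in candidates])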