Commit f1bfe22
Fix support for sklearn<1.0
kbattocchi committed Jul 8, 2022
1 parent b95f594

Showing 4 changed files with 121 additions and 43 deletions.
51 changes: 51 additions & 0 deletions econml/sklearn_extensions/linear_model.py
@@ -38,6 +38,51 @@
from joblib import Parallel, delayed


+# TODO: once we drop support for sklearn < 1.0, we can remove this
+def _add_normalize(to_wrap):
+    """
+    Add a fictitious "normalize" argument to linear model initializer signatures.
+    This is necessary for their get_params to play nicely with some other sklearn-internal methods.
+    Note that directly adding a **params argument to the ordinary initializer will not work,
+    because get_params explicitly looks only at the initializer signature arguments that are not
+    varargs or varkeywords, so we need to modify the signature of the initializer to include the
+    "normalize" argument.
+    """
+    # if we're decorating a class, just update the __init__ method,
+    # so that the result is still a class instead of a wrapper method
+    if isinstance(to_wrap, type):
+        import sklearn
+        from packaging import version
+
+        if version.parse(sklearn.__version__) >= version.parse("1.0"):
+            # normalize was deprecated or removed; don't need to do anything
+            return to_wrap
+
+        else:
+            from inspect import Parameter, signature
+            from functools import wraps
+
+            old_init = to_wrap.__init__
+
+            @wraps(old_init)
+            def new_init(self, *args, normalize=False, **kwargs):
+                if normalize is not False:
+                    warnings.warn("normalize is deprecated and will be ignored", stacklevel=2)
+                return old_init(self, *args, **kwargs)
+
+            sig = signature(old_init)
+            sig = sig.replace(parameters=[*sig.parameters.values(),
+                                          Parameter("normalize", kind=Parameter.KEYWORD_ONLY, default=False)])
+
+            new_init.__signature__ = sig
+            to_wrap.__init__ = new_init
+            return to_wrap
+    else:
+        raise ValueError("This decorator was applied to a method, but is intended to be applied only to types.")
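To see the technique in isolation, here is a minimal, self-contained sketch with a toy Point class (not part of this diff). get_params-style introspection goes through inspect.signature, and signature() honors an explicit __signature__ attribute, so patching it is enough to surface the extra keyword-only argument:

from functools import wraps
from inspect import Parameter, signature

class Point:
    def __init__(self, x=0):
        self.x = x

old_init = Point.__init__

@wraps(old_init)
def new_init(self, *args, normalize=False, **kwargs):
    # accept and silently drop the fictitious argument
    old_init(self, *args, **kwargs)

sig = signature(old_init)
new_init.__signature__ = sig.replace(
    parameters=[*sig.parameters.values(),
                Parameter("normalize", kind=Parameter.KEYWORD_ONLY, default=False)])
Point.__init__ = new_init

print(signature(Point.__init__))  # (self, x=0, *, normalize=False)
Point(x=1, normalize=False)       # accepted, then ignored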


def _weighted_check_cv(cv=5, y=None, classifier=False, random_state=None):
cv = 5 if cv is None else cv
if isinstance(cv, numbers.Integral):
@@ -131,6 +176,7 @@ def _fit_weighted_linear_model(self, X, y, sample_weight, check_input=None):
super().fit(**fit_params)


+@_add_normalize
class WeightedLasso(WeightedModelMixin, Lasso):
"""Version of sklearn Lasso that accepts weights.
@@ -236,6 +282,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
return self


+@_add_normalize
class WeightedMultiTaskLasso(WeightedModelMixin, MultiTaskLasso):
"""Version of sklearn MultiTaskLasso that accepts weights.
@@ -325,6 +372,7 @@ def fit(self, X, y, sample_weight=None):
return self


+@_add_normalize
class WeightedLassoCV(WeightedModelMixin, LassoCV):
"""Version of sklearn LassoCV that accepts weights.
@@ -443,6 +491,7 @@ def fit(self, X, y, sample_weight=None):
return self


+@_add_normalize
class WeightedMultiTaskLassoCV(WeightedModelMixin, MultiTaskLassoCV):
"""Version of sklearn MultiTaskLassoCV that accepts weights.
@@ -582,6 +631,7 @@ def _get_theta_coefs_and_tau_sq(i, X, sample_weight, alpha_cov, n_alphas_cov, ma
return coefs, tausq


+@_add_normalize
class DebiasedLasso(WeightedLasso):
"""Debiased Lasso model.
@@ -927,6 +977,7 @@ def _get_unscaled_coef_var(self, X, theta_hat, sample_weight):
return _unscaled_coef_var


+@_add_normalize
class MultiOutputDebiasedLasso(MultiOutputRegressor):
"""Debiased MultiOutputLasso model.
52 changes: 31 additions & 21 deletions econml/tests/test_dml.py
@@ -7,7 +7,7 @@
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures
-from sklearn.model_selection import KFold, GroupKFold
+from sklearn.model_selection import KFold, GroupKFold, check_cv
from econml.dml import DML, LinearDML, SparseLinearDML, KernelDML, CausalForestDML
from econml.dml import NonParamDML
import numpy as np
@@ -1141,27 +1141,37 @@ def test_groups(self):
est.fit(y, t, groups=groups)

# test nested grouping
-        class NestedModel(LassoCV):
-            def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True,
-                         precompute='auto', max_iter=1000, tol=1e-4,
-                         copy_X=True, cv=None, verbose=False, n_jobs=None,
-                         positive=False, random_state=None, selection='cyclic'):
-
-                super().__init__(
-                    eps=eps, n_alphas=n_alphas, alphas=alphas,
-                    fit_intercept=fit_intercept,
-                    precompute=precompute, max_iter=max_iter, tol=tol, copy_X=copy_X,
-                    cv=cv, verbose=verbose, n_jobs=n_jobs, positive=positive,
-                    random_state=random_state, selection=selection)
+        class NestedModel:
+            def __init__(self, cv):
+                self.model = LassoCV(cv=cv)
+
+            # DML nested CV works via a 'cv' attribute
+            @property
+            def cv(self):
+                return self.model.cv
+
+            @cv.setter
+            def cv(self, value):
+                self.model.cv = value
+
            def fit(self, X, y):
-                # ensure that the grouping has worked correctly and we get all 10 copies of the items in
-                # whichever groups we saw
-                (yvals, cts) = np.unique(y, return_counts=True)
-                for (yval, ct) in zip(yvals, cts):
-                    if ct != 10:
-                        raise Exception("Grouping failed; received {0} copies of {1} instead of 10".format(ct, yval))
-                return super().fit(X, y)
+                for (train, test) in check_cv(self.cv, y).split(X, y):
+                    (yvals, cts) = np.unique(y[train], return_counts=True)
+                    # with 2-fold outer and 2-fold inner grouping, and six total groups,
+                    # should get 1 or 2 groups per split
+                    if len(yvals) > 2:
+                        raise Exception(f"Grouping failed: received {len(yvals)} groups instead of at most 2")
+
+                    # ensure that the grouping has worked correctly and we get all 10 copies of the items in
+                    # whichever groups we see
+                    for (yval, ct) in zip(yvals, cts):
+                        if ct != 10:
+                            raise Exception(f"Grouping failed; received {ct} copies of {yval} instead of 10")
+                self.model.fit(X, y)
+                return self
+
+            def predict(self, X):
+                return self.model.predict(X)

# test nested grouping
est = LinearDML(model_y=NestedModel(cv=2), model_t=NestedModel(cv=2), cv=GroupKFold(2))
@@ -1170,6 +1180,6 @@ def fit(self, X, y):
# by default, we use 5 split cross-validation for our T and Y models
# but we don't have enough groups here to split both the outer and inner samples with grouping
# TODO: does this imply we should change some defaults to make this more likely to succeed?
-        est = LinearDML(cv=GroupKFold(2))
+        est = LinearDML(model_y=LassoCV(cv=5), model_t=LassoCV(cv=5), cv=GroupKFold(2))
with pytest.raises(Exception):
est.fit(y, t, groups=groups)
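A quick aside on the mechanics the new NestedModel.fit relies on, as a minimal sketch (not part of the diff) using the same toy layout as the test (six group labels, ten copies each): check_cv normalizes an integer cv into a KFold splitter, so the wrapper can walk the same splits the wrapped LassoCV will later use and inspect the group counts in each training fold.

import numpy as np
from sklearn.model_selection import check_cv

y = np.repeat(np.arange(6.0), 10)     # six "groups", ten copies each
splitter = check_cv(2, y)             # an int becomes KFold(n_splits=2)
for train, test in splitter.split(y.reshape(-1, 1), y):
    print(np.unique(y[train], return_counts=True))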
56 changes: 34 additions & 22 deletions econml/tests/test_drlearner.py
@@ -13,8 +13,8 @@
from sklearn.base import TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.exceptions import DataConversionWarning
-from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression
-from sklearn.model_selection import KFold, GroupKFold
+from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
+from sklearn.model_selection import KFold, GroupKFold, check_cv
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures

@@ -799,27 +799,37 @@ def test_groups(self):
est.fit(y, t, W=w, groups=groups)

# test nested grouping
-        class NestedModel(LassoCV):
-            def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True,
-                         precompute='auto', max_iter=1000, tol=1e-4,
-                         copy_X=True, cv=None, verbose=False, n_jobs=None,
-                         positive=False, random_state=None, selection='cyclic'):
-
-                super().__init__(
-                    eps=eps, n_alphas=n_alphas, alphas=alphas,
-                    fit_intercept=fit_intercept,
-                    precompute=precompute, max_iter=max_iter, tol=tol, copy_X=copy_X,
-                    cv=cv, verbose=verbose, n_jobs=n_jobs, positive=positive,
-                    random_state=random_state, selection=selection)
+        class NestedModel:
+            def __init__(self, cv):
+                self.model = LassoCV(cv=cv)
+
+            # DML nested CV works via a 'cv' attribute
+            @property
+            def cv(self):
+                return self.model.cv
+
+            @cv.setter
+            def cv(self, value):
+                self.model.cv = value
+
            def fit(self, X, y):
-                # ensure that the grouping has worked correctly and we get all 10 copies of the items in
-                # whichever groups we saw
-                (yvals, cts) = np.unique(y, return_counts=True)
-                for (yval, ct) in zip(yvals, cts):
-                    if ct != 10:
-                        raise Exception("Grouping failed; received {0} copies of {1} instead of 10".format(ct, yval))
-                return super().fit(X, y)
+                for (train, test) in check_cv(self.cv, y).split(X, y):
+                    (yvals, cts) = np.unique(y[train], return_counts=True)
+                    # with 2-fold outer and 2-fold inner grouping, and six total groups,
+                    # should get 1 or 2 groups per split
+                    if len(yvals) > 2:
+                        raise Exception(f"Grouping failed: received {len(yvals)} groups instead of at most 2")
+
+                    # ensure that the grouping has worked correctly and we get all 10 copies of the items in
+                    # whichever groups we see
+                    for (yval, ct) in zip(yvals, cts):
+                        if ct != 10:
+                            raise Exception(f"Grouping failed; received {ct} copies of {yval} instead of 10")
+                self.model.fit(X, y)
+                return self
+
+            def predict(self, X):
+                return self.model.predict(X)

# test nested grouping
est = LinearDRLearner(model_propensity=LogisticRegression(),
@@ -829,7 +839,9 @@ def fit(self, X, y):
# by default, we use 5 split cross-validation for our T and Y models
# but we don't have enough groups here to split both the outer and inner samples with grouping
# TODO: does this imply we should change some defaults to make this more likely to succeed?
-        est = LinearDRLearner(cv=GroupKFold(2))
+        est = LinearDRLearner(model_propensity=LogisticRegressionCV(cv=5),
+                              model_regression=LassoCV(cv=5),
+                              cv=GroupKFold(2))
with pytest.raises(Exception):
est.fit(y, t, W=w, groups=groups)

5 changes: 5 additions & 0 deletions econml/tests/test_linear_model.py
@@ -51,6 +51,11 @@ def setUpClass(cls):
cls.y_2D_consistent = np.concatenate((TestLassoExtensions.y_simple.reshape(-1, 1),
TestLassoExtensions.y2_full.reshape(-1, 1)), axis=1)

+    def test_can_clone(self):
+        for model in [WeightedLasso(), WeightedLassoCV(), WeightedMultiTaskLassoCV(),
+                      WeightedLassoCVWrapper(), DebiasedLasso(), MultiOutputDebiasedLasso()]:
+            clone(model)
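The test works because sklearn's clone essentially reconstructs an estimator as type(est)(**est.get_params()), and it fails whenever get_params and the __init__ signature disagree, which is exactly the mismatch _add_normalize papers over on sklearn<1.0. A minimal usage sketch (the alpha parameter is chosen purely for illustration):

from sklearn.base import clone
from econml.sklearn_extensions.linear_model import WeightedLasso

est = WeightedLasso(alpha=0.1)
est2 = clone(est)                 # rebuilds the estimator from get_params()
assert est2.get_params()["alpha"] == 0.1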

#################
# WeightedLasso #
#################
