Merge pull request #17 from Orange-OpenSource/check-regul-scaling

Better tests for l2 alpha scaling between different sklearn estimators

pierrenodet authored Feb 25, 2025
2 parents fc8781c + a9f4021 commit 6bb8e8b
Showing 4 changed files with 243 additions and 33 deletions.
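
Taken together, the hunks below express each estimator's l2 strength as the single regul parameter of LinearModel, apparently in a summed-loss convention (sum_i loss_i + 0.5 * regul * ||w||^2). A compact sketch of the conversions they implement; the helper name is hypothetical:

# Sketch of the l2 conversions implemented in mislabeled/probe/_linear.py below.
def common_l2_regul(kind, n_samples, alpha=None, C=None, batch_size=None):
    if kind == "sgd":      # SGDClassifier / SGDRegressor: alpha scales a mean loss
        return alpha * n_samples
    if kind == "logreg":   # LogisticRegression: C divides a summed loss
        return 1.0 / C
    if kind == "mlp":      # MLPClassifier / MLPRegressor: alpha is applied per batch
        return alpha * batch_size / n_samples
    raise NotImplementedError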
55 changes: 47 additions & 8 deletions mislabeled/probe/_linear.py
@@ -213,7 +213,6 @@ def linearize_pipeline(estimator, X, y):
     return linearize(estimator[-1], X, y)


-@linearize.register(SGDRegressor)
 @linearize.register(Ridge)
 @linearize.register(RidgeCV)
 @linearize.register(RidgeClassifier)
@@ -254,11 +253,39 @@ def linearize_linear_model_ridge(estimator, X, y):


 @linearize.register(SGDClassifier)
-def linearize_linear_model_sgdclassifier(estimator, X, y):
-    X, y = check_X_y(X, y, accept_sparse=True, dtype=[np.float64, np.float32])
+@linearize.register(SGDRegressor)
+def linearize_linear_model_sgd(estimator, X, y):
+    X, y = check_X_y(
+        X,
+        y,
+        multi_output=is_regressor(estimator),
+        accept_sparse=True,
+        dtype=[np.float64, np.float32],
+    )

     coef = estimator.coef_.T
     intercept = estimator.intercept_ if estimator.fit_intercept else None
-    linear = LinearModel(coef, intercept, loss=estimator.loss, regul=estimator.alpha)
+
+    if is_classifier(estimator) and estimator.loss == "squared_error":
+        lb = LabelBinarizer(pos_label=1, neg_label=-1)
+        y = lb.fit_transform(y)
+
+    if is_regressor(estimator):
+        if y.ndim == 1:
+            y = y.reshape(-1, 1)
+        if coef.ndim == 1:
+            coef = coef.reshape(-1, 1)
+
+    if estimator.penalty is None:
+        regul = None
+    elif estimator.penalty == "l2":
+        regul = estimator.alpha * X.shape[0]
+    else:
+        raise NotImplementedError("lasso not implemented yet.")
+
+    loss = "l2" if estimator.loss == "squared_error" else estimator.loss
+
+    linear = LinearModel(coef, intercept, loss=loss, regul=regul)
     return linear, X, y
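
Why alpha * X.shape[0]: sklearn's SGD estimators minimize an averaged data term, (1/n) * sum_i loss_i + 0.5 * alpha * ||w||^2, so under the summed-loss convention assumed above the equivalent strength is regul = alpha * n. A minimal sketch of two parametrizations that should then coincide (values illustrative):

# Sketch: the same effective l2 penalty in two sklearn parametrizations.
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression, SGDClassifier

n, a = 1000, 1.0  # dataset size, target penalty in the summed-loss convention
X, y = make_blobs(n_samples=n, centers=2, random_state=0)

logreg = LogisticRegression(C=1 / a).fit(X, y)               # regul = 1 / C = a
sgd = SGDClassifier(loss="log_loss", alpha=a / n).fit(X, y)  # regul = alpha * n = a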


@@ -272,11 +299,11 @@ def linearize_linear_model_logreg(estimator, X, y):
         regul = None
     elif estimator.penalty == "l2":
         if hasattr(estimator, "C_"):
-            regul = 1.0 / (2.0 * estimator.C_)
+            regul = 1.0 / (estimator.C_)
         else:
-            regul = 1.0 / (2.0 * estimator.C)
+            regul = 1.0 / (estimator.C)
     else:
-        raise NotImplementedError()
+        raise NotImplementedError("lasso not implemented yet.")

     linear = LinearModel(coef, intercept, loss="log_loss", regul=regul)
     return linear, X, y
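
The corrected factor drops out of LogisticRegression's objective once it is rescaled to the summed-loss convention, assuming regul multiplies 0.5 * ||w||^2 as in the SGD branch above:

    0.5 * ||w||^2 + C * sum_i loss_i
        = C * (sum_i loss_i + 0.5 * (1 / C) * ||w||^2)

Rescaling an objective by a positive constant leaves its minimizer unchanged, so the equivalent strength is regul = 1 / C; the previous 1 / (2 * C) double-counted the 1/2 already carried by the penalty convention.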
@@ -338,7 +365,19 @@ def linearize_mlp(estimator, X, y):
     if y.ndim == 1:
         y = y.reshape(-1, 1)

-    linear = LinearModel(coef, intercept, loss=loss, regul=estimator.alpha)
+    if estimator.solver == "lbfgs":
+        batch_size = X.shape[0]
+    elif estimator.batch_size == "auto":
+        batch_size = min(200, X.shape[0])
+    else:
+        batch_size = estimator.batch_size
+
+    linear = LinearModel(
+        coef,
+        intercept,
+        loss=loss,
+        regul=estimator.alpha * batch_size / X.shape[0],
+    )

     return linear, activation, y
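
sklearn's MLPs apply their l2 term per batch (each backprop step adds 0.5 * alpha * ||W||^2 divided by the batch size to the batch's mean loss), so the effective strength depends on how large the batches are; the conversion above therefore folds the resolved batch size into regul. A small sketch of that resolution, mirroring the hunk and sklearn's documented rules (full batch for lbfgs, min(200, n_samples) for "auto"):

# Sketch: the batch size an MLP actually trains with (mirrors the code above).
def effective_batch_size(solver, batch_size, n_samples):
    if solver == "lbfgs":      # lbfgs ignores batch_size and runs full-batch
        return n_samples
    if batch_size == "auto":   # sklearn default: min(200, n_samples)
        return min(200, n_samples)
    return batch_size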

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -55,7 +55,8 @@ extra-args = [
 extra-dependencies = [
     "mislabeled[examples]",
     "pytest-benchmark",
-    "scipy>=1.15.0"
+    "scipy>=1.15.0",
+    "statsmodels"
 ]

 [tool.coverage.run]
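
statsmodels joins the test dependencies for the new check in tests/probe/test_self_influence.py below, which validates ApproximateLOO and the linearized hessian against a statsmodels GLM.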
152 changes: 144 additions & 8 deletions tests/probe/test_linear.py
@@ -1,3 +1,5 @@
+import math
+
 import numpy as np
 import pytest
 from scipy.differentiate import hessian, jacobian
@@ -11,9 +13,10 @@
     SGDClassifier,
     SGDRegressor,
 )
+from sklearn.neural_network import MLPClassifier, MLPRegressor
 from sklearn.preprocessing import StandardScaler

-from mislabeled.probe import linearize
+from mislabeled.probe import ParamNorm2, linearize


 @pytest.mark.parametrize(
@@ -35,13 +38,7 @@
         SGDClassifier(loss="log_loss", fit_intercept=True),
     ],
 )
-@pytest.mark.parametrize(
-    "num_classes",
-    [
-        2,
-        3,
-    ],
-)
+@pytest.mark.parametrize("num_classes", [2, 3])
 def test_grad_hess(model, num_classes):
     if is_classifier(model):
         X, y = make_blobs(n_samples=100, random_state=1, centers=num_classes)
@@ -107,3 +104,142 @@ def f(prc):
         atol=1e-3,  # this one is good
         strict=True,
     )
+
+
+@pytest.mark.parametrize("num_samples", [100, 1_000])
+@pytest.mark.parametrize("num_classes", [2, 10])
+@pytest.mark.parametrize("alpha", [1e-2, 1, 1e2])
+def test_l2_regul_clf(num_samples, num_classes, alpha):
+    X, y = make_blobs(
+        n_samples=num_samples,
+        n_features=2,
+        cluster_std=0.1,
+        centers=num_classes,
+        random_state=1,
+    )
+    X = StandardScaler().fit_transform(X)
+
+    models = [
+        lambda alpha: LogisticRegression(
+            random_state=1,
+            C=1 / alpha,
+            max_iter=10000,
+            tol=1e-8,
+        ),
+        lambda alpha: MLPClassifier(
+            hidden_layer_sizes=(),
+            solver="sgd",
+            shuffle=False,
+            random_state=1,
+            learning_rate_init=0.1 * num_classes,
+            max_iter=100000,
+            n_iter_no_change=10000,
+            tol=1e-8,
+            learning_rate="constant",
+            alpha=alpha,
+            batch_size=X.shape[0],
+        ),
+    ]
+    if num_classes == 2:
+        models += [
+            lambda alpha: SGDClassifier(
+                loss="log_loss",
+                learning_rate="constant",
+                eta0=0.1 * num_classes,
+                tol=1e-8,
+                shuffle=False,
+                random_state=1,
+                max_iter=100000,
+                n_iter_no_change=10000,
+                alpha=alpha / X.shape[0],
+                n_jobs=-1,
+            )
+        ]
+
+    models = [model(alpha).fit(X, y) for model in models]
+    norms = [ParamNorm2()(model, X, y).item() for model in models]
+
+    assert math.isclose(min(norms), max(norms), rel_tol=0.1)
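
(SGDClassifier joins the comparison only when num_classes == 2, presumably because sklearn trains it one-vs-rest in the multiclass case, which is not the multinomial objective optimized by LogisticRegression and MLPClassifier.)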


+@pytest.mark.parametrize("num_samples", [100, 1_000])
+@pytest.mark.parametrize("num_classes", [2, 10])
+@pytest.mark.parametrize("alpha", [1e-2, 1, 1e2])
+def test_l2_regul_clf_as_reg(num_samples, num_classes, alpha):
+    X, y = make_blobs(
+        n_samples=num_samples,
+        n_features=2,
+        cluster_std=0.1,
+        centers=num_classes,
+        random_state=1,
+    )
+    X = StandardScaler().fit_transform(X)
+
+    models = [
+        lambda alpha: RidgeClassifier(
+            random_state=1,
+            alpha=alpha,
+            max_iter=10000,
+            tol=1e-8,
+        ),
+        lambda alpha: SGDClassifier(
+            loss="squared_error",
+            learning_rate="constant",
+            eta0=0.0001,
+            tol=1e-8,
+            shuffle=False,
+            random_state=1,
+            max_iter=100000,
+            n_iter_no_change=10000,
+            alpha=alpha / X.shape[0],
+            n_jobs=-1,
+        ),
+    ]
+
+    models = [model(alpha).fit(X, y) for model in models]
+    norms = [ParamNorm2()(model, X, y).item() for model in models]
+
+    assert math.isclose(min(norms), max(norms), rel_tol=0.01)
+
+
+@pytest.mark.parametrize("num_samples", [100, 1_000, 10_000])
+@pytest.mark.parametrize("alpha", [1e-2, 1, 1e2])
+def test_l2_regul_reg(num_samples, alpha):
+    X, y = make_regression(n_samples=num_samples, n_features=2, random_state=1)
+    X = StandardScaler().fit_transform(X)
+
+    models = [
+        lambda alpha: Ridge(
+            random_state=1,
+            alpha=alpha,
+            solver="cholesky",
+            max_iter=10000,
+            tol=1e-8,
+        ),
+        lambda alpha: SGDRegressor(
+            learning_rate="constant",
+            eta0=0.00001,
+            tol=1e-10,
+            shuffle=False,
+            random_state=1,
+            max_iter=10000,
+            n_iter_no_change=1000,
+            alpha=alpha / X.shape[0],
+        ),
+        lambda alpha: MLPRegressor(
+            hidden_layer_sizes=(),
+            solver="sgd",
+            shuffle=False,
+            random_state=1,
+            max_iter=10000,
+            tol=1e-8,
+            learning_rate="constant",
+            alpha=alpha,
+            batch_size=X.shape[0],
+        ),
+    ]
+
+    models = [model(alpha).fit(X, y) for model in models]
+    norms = [ParamNorm2()(model, X, y).item() for model in models]
+
+    assert math.isclose(min(norms), max(norms), rel_tol=0.001)
66 changes: 50 additions & 16 deletions tests/probe/test_self_influence.py
@@ -25,23 +25,25 @@
 from sklearn.metrics import log_loss, mean_squared_error
 from sklearn.model_selection import LeaveOneOut
 from sklearn.preprocessing import LabelBinarizer, StandardScaler
+from statsmodels.genmod import families
+from statsmodels.genmod.generalized_linear_model import GLM

-from mislabeled.probe._influence import ApproximateLOO, SelfInfluence
+from mislabeled.probe import ApproximateLOO, SelfInfluence, linearize


 @pytest.mark.parametrize(
     "model",
     [
-        RidgeClassifier(fit_intercept=False, alpha=1e-4),
-        RidgeClassifier(fit_intercept=False, alpha=1e4),
+        RidgeClassifier(fit_intercept=False, alpha=1e-4),
+        RidgeClassifier(fit_intercept=False, alpha=1e-2),
+        RidgeClassifier(fit_intercept=False, alpha=1e2),
         RidgeClassifier(fit_intercept=False),
         RidgeClassifier(fit_intercept=True),
-        LogisticRegression(fit_intercept=False),
-        LogisticRegression(fit_intercept=False, C=1e-4),
-        # LogisticRegression(fit_intercept=True),
+        LogisticRegression(fit_intercept=False),
+        # LogisticRegression(fit_intercept=True, max_iter=10000, tol=1e-8),
+        # LogisticRegression(fit_intercept=True, C=1e-2, max_iter=10000, tol=1e-8),
+        # LogisticRegression(fit_intercept=True, C=1e2, max_iter=10000, tol=1e-8),
+        LogisticRegression(fit_intercept=True, max_iter=10000, tol=1e-8),
         Ridge(fit_intercept=False),
-        Ridge(fit_intercept=True),
+        # Ridge(fit_intercept=True),
         LinearRegression(fit_intercept=False),
         # LinearRegression(fit_intercept=True),
     ],
@@ ... @@
 )
 def test_si_aloo_approximates_loo(model, num_classes):
     if is_classifier(model):
-        X, y = make_blobs(n_samples=100, random_state=1, centers=num_classes)
+        X, y = make_blobs(n_samples=1000, random_state=1, centers=num_classes)
         if isinstance(model, RidgeClassifier):

             def loss_fn(model, X, y):
@@ -72,10 +74,10 @@ def loss_fn(model, X, y):
                 y, model.predict_proba(X), labels=np.arange(num_classes)
             )
     else:
-        if num_classes - 1 > 1:
+        if num_classes > 2:
             return True
         X, y = make_regression(
-            n_samples=100,
+            n_samples=1000,
             n_features=2,
             n_informative=2,
             n_targets=num_classes - 1,
@@ -110,11 +112,43 @@ def eval(model, X, y, train, test):
     )
     loo_diff = np.asarray(loo_diff)

-    assert pearsonr(si_scores, loo_diff).statistic > 0.95
-    assert pearsonr(aloo_scores, loo_diff).statistic > 0.95
+    closed_form = isinstance(model, (RidgeClassifier, Ridge, LinearRegression))
+
+    assert pearsonr(si_scores, loo_diff).statistic > 0.99
+    assert pearsonr(aloo_scores, loo_diff).statistic > 0.99
     assert math.isclose(
-        np.linalg.lstsq(si_scores[..., None], loo_diff)[0].item(), 1, abs_tol=0.12
+        np.linalg.lstsq(si_scores[..., None], loo_diff)[0].item(),
+        1,
+        abs_tol=0.01 if closed_form else 0.25,
     )
     assert math.isclose(
-        np.linalg.lstsq(aloo_scores[..., None], loo_diff)[0].item(), 1, abs_tol=0.05
+        np.linalg.lstsq(aloo_scores[..., None], loo_diff)[0].item(),
+        1,
+        abs_tol=0.005 if closed_form else 0.2,
     )
+
+
+@pytest.mark.parametrize(
+    "model", [LogisticRegression(fit_intercept=False, penalty=None)]
+)
+@pytest.mark.parametrize("num_classes", [2])
+def test_aloo_against_statsmodels(model, num_classes):
+    X, y = make_blobs(n_samples=30, random_state=1, centers=num_classes)
+
+    X = StandardScaler().fit_transform(X)
+
+    model.fit(X, y)
+
+    aloo = ApproximateLOO()
+
+    res = GLM(y, X, family=families.Binomial()).fit()
+    model.coef_ = res.params.reshape(1, -1)
+
+    aloo_scores = aloo(model, X, y)
+
+    np.testing.assert_allclose(
+        aloo_scores, -2 * res.get_influence(observed=True).cooks_distance[0]
+    )
+    np.testing.assert_allclose(
+        linearize(model, X, y)[0].hessian(X, y), -res.model.hessian(res.params)
+    )
