Commit

Move data generation to fixture

FrancescMartiEscofetQC committed Jul 5, 2024
1 parent 689dd9c commit ed66a32

Showing 5 changed files with 68 additions and 75 deletions.
30 changes: 26 additions & 4 deletions tests/conftest.py
@@ -9,11 +9,9 @@
 from git_root import git_root
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.discriminant_analysis import (
-    LinearDiscriminantAnalysis,
     QuadraticDiscriminantAnalysis,
 )
 from sklearn.ensemble import (
-    AdaBoostClassifier,
     AdaBoostRegressor,
     BaggingClassifier,
     BaggingRegressor,
@@ -84,7 +82,7 @@
 _SIGMA_TAU = 0.5
 
 all_sklearn_classifiers = [
-    AdaBoostClassifier,
+    # AdaBoostClassifier, # The output probabilities are wrong when there are only two classes, see https://github.com/onnx/sklearn-onnx/issues/1117
     BaggingClassifier,
     CalibratedClassifierCV,
     DecisionTreeClassifier,
@@ -94,7 +92,7 @@
     GradientBoostingClassifier,
     HistGradientBoostingClassifier,
     KNeighborsClassifier,
-    LinearDiscriminantAnalysis,
+    # LinearDiscriminantAnalysis, # The output probabilities are wrong when there are only two classes, see https://github.com/onnx/sklearn-onnx/issues/1116
     LogisticRegression,
     LogisticRegressionCV,
     MLPClassifier,
@@ -369,3 +367,27 @@ def grid_search_data():
     w_test = rng.integers(0, n_variants, n_test_samples)
 
     return X, y_class, y_reg, w, X_test, y_test_class, y_test_reg, w_test
+
+
+@pytest.fixture(scope="session")
+def onnx_dataset():
+    rng = np.random.default_rng(_SEED)
+    n_samples = 300
+    n_numerical_features = 5
+
+    X_numerical = rng.standard_normal((n_samples, n_numerical_features))
+
+    X_with_categorical = pd.DataFrame(X_numerical)
+    X_with_categorical[n_numerical_features] = pd.Series(
+        rng.integers(10, 13, n_samples), dtype="category"
+    )  # not start at 0
+    X_with_categorical[n_numerical_features + 1] = pd.Series(
+        rng.choice([-5, 4, -10, -32], size=n_samples), dtype="category"
+    )  # not consecutive
+
+    y_class = rng.integers(0, 2, size=n_samples)
+    y_reg = rng.standard_normal(n_samples)
+
+    w = rng.integers(0, 3, n_samples)
+
+    return X_numerical, X_with_categorical, y_class, y_reg, w
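Note: the fixture is session-scoped, so every ONNX test below receives the same arrays and DataFrame. A minimal sketch of what a consuming test can rely on (the shapes follow from the constants above; the test name and assertions are illustrative, not part of the commit):

    import numpy as np

    def test_onnx_dataset_shapes(onnx_dataset):
        # Unpack the shared, session-scoped data. Tests should avoid mutating
        # these objects in place, since later tests receive the same instances.
        X_numerical, X_with_categorical, y_class, y_reg, w = onnx_dataset
        assert X_numerical.shape == (300, 5)
        # Two categorical columns are appended: one whose values do not start
        # at 0 and one whose values are not consecutive.
        assert X_with_categorical.shape == (300, 7)
        assert set(np.unique(y_class)) <= {0, 1}
        assert w.min() >= 0 and w.max() <= 2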
29 changes: 9 additions & 20 deletions tests/test_drlearner.py
@@ -3,7 +3,6 @@
 
 import numpy as np
 import onnxruntime as rt
-import pandas as pd
 import pytest
 from lightgbm import LGBMClassifier, LGBMRegressor
 from onnxmltools import convert_lightgbm, convert_xgboost
@@ -37,7 +36,7 @@
 )
 @pytest.mark.parametrize("is_classification", [True, False])
 def test_drlearner_onnx(
-    treatment_model_factory, onnx_converter, is_classification, rng
+    treatment_model_factory, onnx_converter, is_classification, onnx_dataset
 ):
     supports_categoricals = treatment_model_factory in [
         LGBMRegressor,
@@ -50,32 +49,22 @@ def test_drlearner_onnx(
     if treatment_model_factory == RadiusNeighborsRegressor:
         treatment_model_params = {"radius": 10}
 
-    # TODO: move this generation to a fixture
-    n_samples = 300
-    n_numerical_features = 5
-    n_variants = 3
+    X_numerical, X_with_categorical, y_class, y_reg, w = onnx_dataset
+    n_numerical_features = X_numerical.shape[1]
-
-    X = rng.standard_normal((n_samples, n_numerical_features))
     if supports_categoricals:
-        n_categorical_features = 2
-        X = pd.DataFrame(X)
-        X[n_numerical_features] = pd.Series(
-            rng.integers(10, 13, n_samples), dtype="category"
-        )  # not start at 0
-        X[n_numerical_features + 1] = pd.Series(
-            rng.choice([-5, 4, -10, -32], size=n_samples), dtype="category"
-        )  # not consecutive
+        X = X_with_categorical
+        n_categorical_features = X.shape[1] - n_numerical_features
     else:
+        X = X_numerical
         n_categorical_features = 0
 
+    n_variants = len(np.unique(w))
     if is_classification:
-        n_classes = 2
-        y = rng.integers(0, n_classes, size=n_samples)
+        y = y_class
         nuisance_model_factory = LogisticRegression
     else:
-        y = rng.standard_normal(n_samples)
+        y = y_reg
         nuisance_model_factory = LinearRegression
-    w = rng.integers(0, n_variants, n_samples)
 
     ml = DRLearner(
         is_classification,
29 changes: 11 additions & 18 deletions tests/test_rlearner.py
@@ -55,7 +55,9 @@ def test_r_loss(use_pandas):
     ),
 )
 @pytest.mark.parametrize("is_classification", [True, False])
-def test_rlearner_onnx(treatment_model_factory, onnx_converter, is_classification, rng):
+def test_rlearner_onnx(
+    treatment_model_factory, onnx_converter, is_classification, onnx_dataset
+):
     if not function_has_argument(treatment_model_factory.fit, "sample_weight"):
         pytest.skip()
 
@@ -67,31 +69,22 @@ def test_rlearner_onnx(
         # XGBRegressor,
     ]
 
-    n_samples = 300
-    n_numerical_features = 5
-    n_variants = 3
+    X_numerical, X_with_categorical, y_class, y_reg, w = onnx_dataset
+    n_numerical_features = X_numerical.shape[1]
-
-    X = rng.standard_normal((n_samples, n_numerical_features))
     if supports_categoricals:
-        n_categorical_features = 2
-        X = pd.DataFrame(X)
-        X[n_numerical_features] = pd.Series(
-            rng.integers(10, 13, n_samples), dtype="category"
-        )  # not start at 0
-        X[n_numerical_features + 1] = pd.Series(
-            rng.choice([-5, 4, -10, -32], size=n_samples), dtype="category"
-        )  # not consecutive
+        X = X_with_categorical
+        n_categorical_features = X.shape[1] - n_numerical_features
     else:
+        X = X_numerical
         n_categorical_features = 0
 
+    n_variants = len(np.unique(w))
     if is_classification:
-        n_classes = 2
-        y = rng.integers(0, n_classes, size=n_samples)
+        y = y_class
         nuisance_model_factory = LogisticRegression
     else:
-        y = rng.standard_normal(n_samples)
+        y = y_reg
         nuisance_model_factory = LinearRegression
-    w = rng.integers(0, n_variants, n_samples)
 
     ml = RLearner(
         is_classification,
39 changes: 15 additions & 24 deletions tests/test_tlearner.py
@@ -5,7 +5,6 @@
 
 import numpy as np
 import onnxruntime as rt
-import pandas as pd
 import pytest
 from lightgbm import LGBMClassifier, LGBMRegressor
 from onnxmltools import convert_lightgbm, convert_xgboost
@@ -15,7 +14,6 @@
     HistGradientBoostingClassifier,
     HistGradientBoostingRegressor,
 )
-from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.neighbors import (
     RadiusNeighborsClassifier,
     RadiusNeighborsRegressor,
@@ -55,7 +53,9 @@
         ]
     ),
 )
-def test_tlearner_onnx(nuisance_model_factory, onnx_converter, is_classification, rng):
+def test_tlearner_onnx(
+    nuisance_model_factory, onnx_converter, is_classification, onnx_dataset
+):
     supports_categoricals = nuisance_model_factory in [
         LGBMClassifier,
         LGBMRegressor,
@@ -81,30 +81,21 @@ def test_tlearner_onnx(
         nuisance_model_params = {"enable_categorical": True}
     else:
         nuisance_model_params = None
-    n_samples = 300
-    n_numerical_features = 5
-    n_variants = 3
-    n_classes = (
-        2 if nuisance_model_factory == GaussianProcessClassifier else 3
-    )  # convert_sklearn only supports binary classification with GaussianProcessClassifier
-    X = rng.standard_normal((n_samples, n_numerical_features))
-
+    X_numerical, X_with_categorical, y_class, y_reg, w = onnx_dataset
+    n_numerical_features = X_numerical.shape[1]
+
     if supports_categoricals:
-        n_categorical_features = 2
-        X = pd.DataFrame(X)
-        X[n_numerical_features] = pd.Series(
-            rng.integers(10, 13, n_samples), dtype="category"
-        )  # not start at 0
-        X[n_numerical_features + 1] = pd.Series(
-            rng.choice([-5, 4, -10, -32], size=n_samples), dtype="category"
-        )  # not consecutive
+        X = X_with_categorical
+        n_categorical_features = X.shape[1] - n_numerical_features
     else:
+        X = X_numerical
         n_categorical_features = 0
+
+    n_variants = len(np.unique(w))
     if is_classification:
-        y = rng.integers(0, n_classes, size=n_samples)
+        y = y_class
     else:
-        y = rng.standard_normal(n_samples)
-    w = rng.integers(0, n_variants, n_samples)
+        y = y_reg
 
     ml = TLearner(
         is_classification,
@@ -140,8 +131,8 @@ def test_tlearner_onnx(
         onnx_X = X.to_numpy(np.float32)
         # This is needed for categoricals as LGBM uses the categorical codes, when
        # other implementations support categoricals this may need to be changed
-        onnx_X[:, n_numerical_features] = X[n_numerical_features].cat.codes
-        onnx_X[:, n_numerical_features + 1] = X[n_numerical_features + 1].cat.codes
+        for i in range(n_categorical_features):
+            onnx_X[:, n_numerical_features + i] = X[n_numerical_features + i].cat.codes
     else:
         onnx_X = X.astype(np.float32)
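Note: the generalized loop above relies on pandas mapping each category to an integer code determined by the sorted category order, not by the raw value. A standalone sketch with the same non-consecutive categories the fixture uses (illustrative only, not part of the commit):

    import pandas as pd

    s = pd.Series([-5, 4, -10, -32], dtype="category")
    # Categories are stored sorted as [-32, -10, -5, 4], so each value is
    # replaced by its position in that order rather than by the value itself.
    print(s.cat.categories.tolist())  # [-32, -10, -5, 4]
    print(s.cat.codes.tolist())  # [2, 3, 1, 0]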
16 changes: 7 additions & 9 deletions tests/test_xlearner.py
@@ -63,7 +63,7 @@ def test_xlearner_onnx(
     treatment_onnx_converter,
     propensity_onnx_converter,
     is_classification,
-    rng,
+    onnx_dataset,
 ):
     treatment_model_params: Params | None
     if treatment_model_factory == RadiusNeighborsRegressor:
@@ -76,19 +76,17 @@ def test_xlearner_onnx(
     else:
         propensity_model_params = None
 
-    n_samples = 300
-    n_numerical_features = 5
-    n_variants = 3
-    X = rng.standard_normal((n_samples, n_numerical_features))
+    X, _, y_class, y_reg, w = onnx_dataset
+    n_numerical_features = X.shape[1]
+    n_variants = len(np.unique(w))
     if is_classification:
-        n_classes = 2
-        y = rng.integers(0, n_classes, size=n_samples)
+        y = y_class
         nuisance_model_factory = LGBMClassifier
     else:
-        y = rng.standard_normal(n_samples)
+        y = y_reg
         nuisance_model_factory = LGBMRegressor
+
     nuisance_model_params = {"n_estimators": 1}
-    w = rng.integers(0, n_variants, n_samples)
 
     ml = XLearner(
         is_classification,
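Note: all four test files exercise the same round trip: fit on the fixture data, convert each base model to ONNX, and check that onnxruntime reproduces the in-memory predictions. A rough standalone sketch of that pattern with a single LightGBM regressor (the data, tolerance, and names are illustrative, not taken from the commit):

    import numpy as np
    import onnxruntime as rt
    from lightgbm import LGBMRegressor
    from onnxmltools import convert_lightgbm
    from onnxmltools.convert.common.data_types import FloatTensorType

    X = np.random.default_rng(0).standard_normal((300, 5)).astype(np.float32)
    y = X.sum(axis=1)
    model = LGBMRegressor(n_estimators=1).fit(X, y)

    # Declare a float32 input signature and convert the fitted model.
    onnx_model = convert_lightgbm(
        model, initial_types=[("X", FloatTensorType([None, X.shape[1]]))]
    )
    sess = rt.InferenceSession(
        onnx_model.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    (pred_onnx,) = sess.run(None, {"X": X})
    np.testing.assert_allclose(model.predict(X), pred_onnx.ravel(), rtol=1e-4)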
