From dd894976fc1131631219987287b6be12cc840eda Mon Sep 17 00:00:00 2001 From: Roman Bredehoft Date: Wed, 20 Sep 2023 16:37:41 +0200 Subject: [PATCH] feat: enable import of fitted linear sklearn models --- .../ml/common/serialization/decoder.py | 6 +- src/concrete/ml/pytest/utils.py | 369 +++++++++++++----- src/concrete/ml/sklearn/__init__.py | 198 +++++++--- src/concrete/ml/sklearn/base.py | 72 +++- .../test_pbs_error_probability_settings.py | 4 +- tests/common/test_serialization.py | 4 +- tests/common/test_skearn_model_lists.py | 48 +-- tests/deployment/test_client_server.py | 4 +- .../test_p_error_binary_search.py | 14 +- tests/seeding/test_seeding.py | 4 +- tests/sklearn/test_common.py | 4 +- tests/sklearn/test_dump_onnx.py | 4 +- tests/sklearn/test_pandas_errors.py | 5 +- tests/sklearn/test_sklearn_models.py | 156 +++++--- 14 files changed, 639 insertions(+), 253 deletions(-) diff --git a/src/concrete/ml/common/serialization/decoder.py b/src/concrete/ml/common/serialization/decoder.py index eebe4e25a4..ce9ab7a70e 100644 --- a/src/concrete/ml/common/serialization/decoder.py +++ b/src/concrete/ml/common/serialization/decoder.py @@ -18,7 +18,7 @@ UniformQuantizationParameters, UniformQuantizer, ) -from ...sklearn import get_sklearn_models +from ...sklearn import get_sklearn_all_models from . 
import SUPPORTED_TORCH_ACTIVATIONS, USE_SKOPS # If USE_SKOPS is False or Skops can't be imported, default to pickle @@ -49,7 +49,7 @@ def _get_fully_qualified_name(object_class: Type) -> str: ] _TRUSTED_CONCRETE_MODELS = [ - _get_fully_qualified_name(model_class) for model_class in get_sklearn_models()["all"] + _get_fully_qualified_name(model_class) for model_class in get_sklearn_all_models() ] # Define all the trusted types that Skops should consider @@ -181,7 +181,7 @@ def object_hook(d: Any) -> Any: # `dump_dict` and `load_dict` method) if not already done if not SERIALIZABLE_CLASSES: serializable_classes = ( - get_sklearn_models()["all"] + get_sklearn_all_models() + list(ALL_QUANTIZED_OPS) + [ QuantizedArray, diff --git a/src/concrete/ml/pytest/utils.py b/src/concrete/ml/pytest/utils.py index ce130991c3..93c04c56b1 100644 --- a/src/concrete/ml/pytest/utils.py +++ b/src/concrete/ml/pytest/utils.py @@ -12,139 +12,310 @@ from ..common.serialization.dumpers import dump, dumps from ..common.serialization.loaders import load, loads -from ..common.utils import get_model_class, get_model_name, is_model_class_in_a_list, is_pandas_type +from ..common.utils import ( + get_model_name, + is_classifier_or_partial_classifier, + is_model_class_in_a_list, + is_pandas_type, + is_regressor_or_partial_regressor, +) from ..sklearn import ( - DecisionTreeClassifier, - DecisionTreeRegressor, - ElasticNet, - GammaRegressor, - Lasso, LinearRegression, - LinearSVC, - LinearSVR, - LogisticRegression, NeuralNetClassifier, NeuralNetRegressor, - PoissonRegressor, - RandomForestClassifier, - RandomForestRegressor, - Ridge, TweedieRegressor, - XGBClassifier, - XGBRegressor, + get_sklearn_linear_models, get_sklearn_neural_net_models, + get_sklearn_tree_models, ) -_regressor_models = [ - XGBRegressor, - GammaRegressor, - LinearRegression, - Lasso, - Ridge, - ElasticNet, - LinearSVR, - PoissonRegressor, - TweedieRegressor, - partial(TweedieRegressor, link="auto", power=0.0), - 
partial(TweedieRegressor, link="auto", power=2.8), - partial(TweedieRegressor, link="log", power=1.0), - partial(TweedieRegressor, link="identity", power=0.0), - DecisionTreeRegressor, - RandomForestRegressor, - partial( - NeuralNetRegressor, - module__n_layers=3, - module__n_w_bits=2, - module__n_a_bits=2, - module__n_accum_bits=7, # Stay with 7 bits for test exec time - module__n_hidden_neurons_multiplier=1, - module__activation_function=nn.ReLU, - max_epochs=10, - verbose=0, - callbacks="disable", - ), -] - -_classifier_models = [ - DecisionTreeClassifier, - RandomForestClassifier, - XGBClassifier, - LinearSVC, - LogisticRegression, - partial( - NeuralNetClassifier, - module__n_layers=3, - module__activation_function=nn.ReLU, - max_epochs=10, - verbose=0, - callbacks="disable", - ), -] - -# Get the data-sets. The data generation is seeded in load_data. -_classifiers_and_datasets = [ - pytest.param( + +def get_pytest_param_regressor(model, n_targets: int): + """Get the pytest parameters to use for testing the regression model. + + The pytest parameters includes the model itself, the parameters to use for generating the + regression data-set and the test identifier (the model's name). + + Args: + model: The regression model to consider. + n_targets (int): The number of targets to consider when generating the dataset. + + Returns: + The pytest parameters to use for testing the regression model. + """ + return pytest.param( model, { - "n_samples": 1000, + "n_samples": 200, "n_features": 10, - "n_classes": n_classes, "n_informative": 10, - "n_redundant": 0, + "n_targets": n_targets, + "noise": 0, }, id=get_model_name(model), ) - for model in _classifier_models - for n_classes in [2, 4] -] - -# Get the data-sets. The data generation is seeded in load_data. 
-# Only LinearRegression supports multi targets -# GammaRegressor, PoissonRegressor and TweedieRegressor only handle positive target values -_regressors_and_datasets = [ - pytest.param( + + +def get_pytest_param_classifier(model, n_classes: int): + """Get the pytest parameters to use for testing the classification model. + + The pytest parameters includes the model itself, the parameters to use for generating the + classification data-set and the test identifier (the model's name). + + Args: + model: The classification model to consider. + n_classes (int): The number of classes to consider when generating the dataset. + + Returns: + The pytest parameters to use for testing the classification model. + """ + return pytest.param( model, { - "n_samples": 200, + "n_samples": 1000, "n_features": 10, + "n_classes": n_classes, "n_informative": 10, - "n_targets": 2 if model == LinearRegression else 1, - "noise": 0, + "n_redundant": 0, }, id=get_model_name(model), ) - for model in _regressor_models -] -# All scikit-learn models in Concrete ML -sklearn_models_and_datasets = _classifiers_and_datasets + _regressors_and_datasets +def _get_sklearn_models_and_datasets(model_classes: List, unique_models: bool = False) -> List: + """Get the pytest parameters to use for testing the given models. + + Args: + model_classes (List): The models to consider. + unique_models (bool): If each models should be represented only once. + + Returns: + models_and_datasets (List): The pytest parameters to use for testing the given models. + + Raises: + ValueError: If one of the given model is neither considered a regressor nor a classifier. 
+ """ + models_and_datasets = [] + + for model_class in model_classes: + if is_regressor_or_partial_regressor(model_class): + + # We only test LinearRegression models for multiple-targets support + n_targets = 2 if model_class == LinearRegression else 1 + + models_and_datasets.append(get_pytest_param_regressor(model_class, n_targets=n_targets)) + + elif is_classifier_or_partial_classifier(model_class): + + # Unless each models should be represented only once, we test classifier models for both + # binary and multiclass classification + n_classes_to_test = [2] if unique_models else [2, 4] + + for n_classes in n_classes_to_test: + models_and_datasets.append( + get_pytest_param_classifier(model_class, n_classes=n_classes) + ) + + else: + raise ValueError( + f"Model class {model_class} is neither a regressor nor a classifier." + ) # pragma: no-cover + + return models_and_datasets + + +def get_sklearn_linear_models_and_datasets( + regressor: bool = True, + classifier: bool = True, + unique_models: bool = False, + include: Optional[Union[str, List[str]]] = None, + exclude: Optional[Union[str, List[str]]] = None, +) -> List: + """Get the pytest parameters to use for testing linear models. + + Args: + regressor (bool): If regressors should be selected. + classifier (bool): If classifiers should be selected. + unique_models (bool): If each models should be represented only once. + include (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) match the given string or list of strings. Default to None. + exclude (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) do not match the given string or list of strings. Default to None. + + Returns: + List: The pytest parameters to use for testing linear models. 
+ """ + + # Get all linear model classes currently available in Concrete ML + linear_classes = get_sklearn_linear_models( + regressor=regressor, + classifier=classifier, + str_in_class_name=include, + str_not_in_class_name=exclude, + ) + + # If the TweedieRegressor has been selected and is allowed to be represented more than once, + # add a few more testing configuration + if not unique_models and is_model_class_in_a_list(TweedieRegressor, linear_classes): + linear_classes += [ + partial(TweedieRegressor, link="auto", power=0.0), + partial(TweedieRegressor, link="auto", power=2.8), + partial(TweedieRegressor, link="log", power=1.0), + partial(TweedieRegressor, link="identity", power=0.0), + ] + + return _get_sklearn_models_and_datasets(linear_classes, unique_models=unique_models) -def get_random_extract_of_sklearn_models_and_datasets(): - """Return a random sublist of sklearn_models_and_datasets. - The sublist contains exactly one model of each kind. +def get_sklearn_tree_models_and_datasets( + regressor: bool = True, + classifier: bool = True, + unique_models: bool = False, + include: Optional[Union[str, List[str]]] = None, + exclude: Optional[Union[str, List[str]]] = None, +) -> List: + """Get the pytest parameters to use for testing tree-based models. + + Args: + regressor (bool): If regressors should be selected. + classifier (bool): If classifiers should be selected. + unique_models (bool): If each models should be represented only once. + include (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) match the given string or list of strings. Default to None. + exclude (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) do not match the given string or list of strings. Default to None. Returns: - the sublist + List: The pytest parameters to use for testing tree-based models. 
+ """ + # Get all tree-based model classes currently available in Concrete ML + tree_classes = get_sklearn_tree_models( + regressor=regressor, + classifier=classifier, + str_in_class_name=include, + str_not_in_class_name=exclude, + ) + + return _get_sklearn_models_and_datasets(tree_classes, unique_models=unique_models) + + +def get_sklearn_neural_net_models_and_datasets( + regressor: bool = True, + classifier: bool = True, + unique_models: bool = False, + include: Optional[Union[str, List[str]]] = None, + exclude: Optional[Union[str, List[str]]] = None, +) -> List: + """Get the pytest parameters to use for testing neural network models. + + Args: + regressor (bool): If regressors should be selected. + classifier (bool): If classifiers should be selected. + unique_models (bool): If each models should be represented only once. + include (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) match the given string or list of strings. Default to None. + exclude (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) do not match the given string or list of strings. Default to None. + Returns: + List: The pytest parameters to use for testing neural network models. 
""" - unique_model_classes = [] - done = {} - for m in sklearn_models_and_datasets: - t = m.values - typ = get_model_class(t[0]) + # Get all neural-network model classes currently available in Concrete ML + selected_neural_net_classes = get_sklearn_neural_net_models( + regressor=regressor, + classifier=classifier, + str_in_class_name=include, + str_not_in_class_name=exclude, + ) - if typ not in done: - done[typ] = True - unique_model_classes.append(m) + neural_net_classes = [] + + # If the NeuralNetRegressor has been selected, configure its initialization parameters + if is_model_class_in_a_list(NeuralNetRegressor, selected_neural_net_classes): + neural_net_classes.append( + partial( + NeuralNetRegressor, + module__n_layers=3, + module__n_w_bits=2, + module__n_a_bits=2, + module__n_accum_bits=7, # Stay with 7 bits for test exec time + module__n_hidden_neurons_multiplier=1, + module__activation_function=nn.ReLU, + max_epochs=10, + verbose=0, + callbacks="disable", + ) + ) - # To avoid to make mistakes and return empty list - assert len(sklearn_models_and_datasets) == 28 - assert len(unique_model_classes) == 18 + # If the NeuralNetClassifier has been selected, configure its initialization parameters + if is_model_class_in_a_list(NeuralNetClassifier, selected_neural_net_classes): + neural_net_classes.append( + partial( + NeuralNetClassifier, + module__n_layers=3, + module__activation_function=nn.ReLU, + max_epochs=10, + verbose=0, + callbacks="disable", + ) + ) + return _get_sklearn_models_and_datasets(neural_net_classes, unique_models=unique_models) + + +def get_sklearn_all_models_and_datasets( + regressor: bool = True, + classifier: bool = True, + unique_models: bool = False, + include: Optional[Union[str, List[str]]] = None, + exclude: Optional[Union[str, List[str]]] = None, +) -> List: + """Get the pytest parameters to use for testing all models available in Concrete ML. + + Args: + regressor (bool): If regressors should be selected. 
+ classifier (bool): If classifiers should be selected. + unique_models (bool): If each models should be represented only once. + include (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) match the given string or list of strings. Default to None. + exclude (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) do not match the given string or list of strings. Default to None. - return unique_model_classes + Returns: + List: The pytest parameters to use for testing all models available in Concrete ML. + """ + return ( + get_sklearn_linear_models_and_datasets( + regressor=regressor, + classifier=classifier, + unique_models=unique_models, + include=include, + exclude=exclude, + ) + + get_sklearn_tree_models_and_datasets( + regressor=regressor, + classifier=classifier, + unique_models=unique_models, + include=include, + exclude=exclude, + ) + + get_sklearn_neural_net_models_and_datasets( + regressor=regressor, + classifier=classifier, + unique_models=unique_models, + include=include, + exclude=exclude, + ) + ) + + +# All scikit-learn models available in Concrete ML to test and their associated dataset parameters +MODELS_AND_DATASETS = get_sklearn_all_models_and_datasets(regressor=True, classifier=True) + +# All unique scikit-learn models available in Concrete ML and their associated dataset parameters +UNIQUE_MODELS_AND_DATASETS = get_sklearn_all_models_and_datasets( + regressor=True, classifier=True, unique_models=True +) def instantiate_model_generic(model_class, n_bits, **parameters): diff --git a/src/concrete/ml/sklearn/__init__.py b/src/concrete/ml/sklearn/__init__.py index 1b938ac4d7..ab6bbc8adc 100644 --- a/src/concrete/ml/sklearn/__init__.py +++ b/src/concrete/ml/sklearn/__init__.py @@ -1,8 +1,12 @@ """Import sklearn models.""" -from typing import List +from typing import Dict, List, Optional, Union from ..common.debugging.custom_assert import assert_true -from 
..common.utils import is_classifier_or_partial_classifier, is_regressor_or_partial_regressor +from ..common.utils import ( + get_model_name, + is_classifier_or_partial_classifier, + is_regressor_or_partial_regressor, +) from .base import _ALL_SKLEARN_MODELS, _LINEAR_MODELS, _NEURALNET_MODELS, _TREE_MODELS from .glm import GammaRegressor, PoissonRegressor, TweedieRegressor from .linear_model import ElasticNet, Lasso, LinearRegression, LogisticRegression, Ridge @@ -13,11 +17,11 @@ from .xgb import XGBClassifier, XGBRegressor -def get_sklearn_models(): - """Return the list of available models in Concrete ML. +def get_sklearn_models() -> Dict[str, List]: + """Return the list of available scikit-learn models in Concrete ML. Returns: - the lists of models in Concrete ML + sklearn_models (Dict[str, List]): The lists of scikit-learn models available in Concrete ML. """ # Import anything in sklearn, just to force the import, to populate _ALL_SKLEARN_MODELS list @@ -26,100 +30,188 @@ def get_sklearn_models(): # We return sorted lists such that it is ordered, to avoid notably issues when it is used # in @pytest.mark.parametrize - ans = { + sklearn_models = { "all": sorted(list(_ALL_SKLEARN_MODELS), key=lambda m: m.__name__), "linear": sorted(list(_LINEAR_MODELS), key=lambda m: m.__name__), "tree": sorted(list(_TREE_MODELS), key=lambda m: m.__name__), "neural_net": sorted(list(_NEURALNET_MODELS), key=lambda m: m.__name__), } - return ans + return sklearn_models -def _filter_models(prelist, classifier: bool, regressor: bool, str_in_class_name: List[str] = None): - """Return the models which are in prelist and follow (classifier, regressor) conditions. +def _filter_models( + models, + classifier: bool, + regressor: bool, + include: Optional[Union[str, List[str]]] = None, + exclude: Optional[Union[str, List[str]]] = None, +): + """Return a list of models filtered by the given conditions, sorted by name. 
Args: - prelist: list of models - classifier (bool): whether you want classifiers or not - regressor (bool): whether you want regressors or not - str_in_class_name (List[str]): if not None, only return models with the given string or - list of strings as a substring in their class name + models: The list of models to consider. + classifier (bool): If classifiers should be considered. + regressor (bool): If regressors should be considered. + include (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) match the given string or list of strings. Default to None. + exclude (Optional[Union[str, List[str]]]): If not None, only return models which names (or + a part of it) do not match the given string or list of strings. Default to None. Returns: - the sublist which fulfills the (classifier, regressor, str_in_class_name) conditions. + The filtered list of models available in Concrete ML, sorted by name. """ assert_true(classifier or regressor, "Please set at least one option") - answer = [] + selected_models = [] if classifier: - answer += [m for m in prelist if is_classifier_or_partial_classifier(m)] + selected_models += [model for model in models if is_classifier_or_partial_classifier(model)] if regressor: - answer += [m for m in prelist if is_regressor_or_partial_regressor(m)] + selected_models += [model for model in models if is_regressor_or_partial_regressor(model)] - if str_in_class_name is not None: - if isinstance(str_in_class_name, str): - str_in_class_name = [str_in_class_name] + if include is not None: + if isinstance(include, str): + include = [include] - for name in str_in_class_name: - answer += [m for m in answer if name in m.__name__] + # Filter the selected models: only include the ones which name matches the given string + # (or at least one of the given strings if it's a list) + selected_models = [ + model for name in include for model in selected_models if name in get_model_name(model) + ] - # We return a 
sorted list such that it is ordered, to avoid notably issues when it is used - # in @pytest.mark.parametrize - return sorted(answer, key=lambda m: m.__name__) + if exclude is not None: + if isinstance(exclude, str): + exclude = [exclude] + + # Filter the selected models: remove the ones which name matches the given string (or at + # least one of the given strings if it's a list) + selected_models = [ + model + for name in exclude + for model in selected_models + if name not in get_model_name(model) + ] + + # Return a sorted list in order to avoid issues when used in @pytest.mark.parametrize + return sorted(selected_models, key=lambda m: m.__name__) def get_sklearn_linear_models( - classifier: bool = True, regressor: bool = True, str_in_class_name: List[str] = None + classifier: bool = True, + regressor: bool = True, + str_in_class_name: Optional[Union[str, List[str]]] = None, + str_not_in_class_name: Optional[Union[str, List[str]]] = None, ): - """Return the list of available linear models in Concrete ML. + """Return a list of available linear models in Concrete ML. + + The list is sorted by name and can be filtered using the given conditions. Args: - classifier (bool): whether you want classifiers or not - regressor (bool): whether you want regressors or not - str_in_class_name (List[str]): if not None, only return models with the given string or - list of strings as a substring in their class name + classifier (bool): If classifiers should be considered. + regressor (bool): If regressors should be considered. + str_in_class_name (Optional[Union[str, List[str]]]): If not None, only return models which + names (or a part of it) match the given string or list of strings. Default to None. + str_not_in_class_name (Optional[Union[str, List[str]]]): If not None, only return models + which names (or a part of it) do not match the given string or list of strings. Default + to None. 
Returns: - the lists of linear models in Concrete ML + The filtered list of linear models available in Concrete ML, sorted by name. """ - prelist = get_sklearn_models()["linear"] - return _filter_models(prelist, classifier, regressor, str_in_class_name) + linear_models = get_sklearn_models()["linear"] + return _filter_models( + linear_models, + classifier, + regressor, + include=str_in_class_name, + exclude=str_not_in_class_name, + ) def get_sklearn_tree_models( - classifier: bool = True, regressor: bool = True, str_in_class_name: List[str] = None + classifier: bool = True, + regressor: bool = True, + str_in_class_name: Optional[Union[str, List[str]]] = None, + str_not_in_class_name: Optional[Union[str, List[str]]] = None, ): - """Return the list of available tree models in Concrete ML. + """Return the list of available tree-based models in Concrete ML. + + The list is sorted by name and can be filtered using the given conditions. Args: - classifier (bool): whether you want classifiers or not - regressor (bool): whether you want regressors or not - str_in_class_name (List[str]): if not None, only return models with the given string or - list of strings as a substring in their class name + classifier (bool): If classifiers should be considered. + regressor (bool): If regressors should be considered. + str_in_class_name (Union[str, List[str]]): If not None, only return models which names (or + a part of it) match the given string or list of strings. Default to None. + str_not_in_class_name (Union[str, List[str]]): If not None, only return models which names + (or a part of it) do not match the given string or list of strings. Default to None. Returns: - the lists of tree models in Concrete ML + The filtered list of tree-based models available in Concrete ML, sorted by name. 
""" - prelist = get_sklearn_models()["tree"] - return _filter_models(prelist, classifier, regressor, str_in_class_name) + tree_models = get_sklearn_models()["tree"] + return _filter_models( + tree_models, classifier, regressor, include=str_in_class_name, exclude=str_not_in_class_name + ) def get_sklearn_neural_net_models( - classifier: bool = True, regressor: bool = True, str_in_class_name: List[str] = None + classifier: bool = True, + regressor: bool = True, + str_in_class_name: Optional[Union[str, List[str]]] = None, + str_not_in_class_name: Optional[Union[str, List[str]]] = None, +): + """Return the list of available neural network models in Concrete ML. + + The list is sorted by name and can be filtered using the given conditions. + + Args: + classifier (bool): If classifiers should be considered. + regressor (bool): If regressors should be considered. + str_in_class_name (Optional[Union[str, List[str]]]): If not None, only return models which + names (or a part of it) match the given string or list of strings. Default to None. + str_not_in_class_name (Optional[Union[str, List[str]]]): If not None, only return models + which names (or a part of it) do not match the given string or list of strings. Default + to None. + + Returns: + The filtered list of neural network models available in Concrete ML, sorted by name. + """ + neural_network_models = get_sklearn_models()["neural_net"] + return _filter_models( + neural_network_models, + classifier, + regressor, + include=str_in_class_name, + exclude=str_not_in_class_name, + ) + + +def get_sklearn_all_models( + classifier: bool = True, + regressor: bool = True, + str_in_class_name: Optional[Union[str, List[str]]] = None, + str_not_in_class_name: Optional[Union[str, List[str]]] = None, ): - """Return the list of available neural net models in Concrete ML. + """Return the list of all available models in Concrete ML. + + The list is sorted by name and can be filtered using the given conditions. 
Args: - classifier (bool): whether you want classifiers or not - regressor (bool): whether you want regressors or not - str_in_class_name (List[str]): if not None, only return models with the given string or - list of strings as a substring in their class name + classifier (bool): If classifiers should be considered. + regressor (bool): If regressors should be considered. + str_in_class_name (Optional[Union[str, List[str]]]): If not None, only return models which + names (or a part of it) match the given string or list of strings. Default to None. + str_not_in_class_name (Optional[Union[str, List[str]]]): If not None, only return models + which names (or a part of it) do not match the given string or list of strings. Default + to None. Returns: - the lists of neural net models in Concrete ML + The filtered list of all models available in Concrete ML, sorted by name. """ - prelist = get_sklearn_models()["neural_net"] - return _filter_models(prelist, classifier, regressor, str_in_class_name) + all_models = get_sklearn_models()["all"] + return _filter_models( + all_models, classifier, regressor, include=str_in_class_name, exclude=str_not_in_class_name + ) diff --git a/src/concrete/ml/sklearn/base.py b/src/concrete/ml/sklearn/base.py index 0ddbafd71d..707248f8a2 100644 --- a/src/concrete/ml/sklearn/base.py +++ b/src/concrete/ml/sklearn/base.py @@ -25,6 +25,7 @@ from concrete.fhe.compilation.configuration import Configuration from concrete.fhe.dtypes.integer import Integer from sklearn.base import clone +from sklearn.utils.validation import check_is_fitted from ..common.check_inputs import check_array_and_assert, check_X_y_and_assert_multi_output from ..common.debugging.custom_assert import assert_true @@ -162,6 +163,9 @@ def __getattr__(self, attr: str): # If the attribute ends with a single underscore and can be found in the underlying # scikit-learn model (once fitted), retrieve its value + # Enable non-training attributes as well once Concrete ML models initialize 
their + # underlying scikit-learn models during initialization + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3373 if ( attr.endswith("_") and not attr.endswith("__") @@ -496,7 +500,7 @@ def compile( Args: X (Data): A representative set of input values used for building cryptographic parameters, as a Numpy array, Torch tensor, Pandas DataFrame or List. This is - usually the training data-set or s sub-set of it. + usually the training data-set or a sub-set of it. configuration (Optional[Configuration]): Options to use for compilation. Default to None. artifacts (Optional[DebugArtifacts]): Artifacts information about the compilation @@ -699,6 +703,8 @@ class BaseClassifier(BaseEstimator): the predicted values as well as handling a mapping of classes in case they are not ordered. """ + # Remove in our next major release + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3994 @property def target_classes_(self) -> Optional[numpy.ndarray]: # pragma: no cover """Get the model's classes. @@ -717,6 +723,8 @@ def target_classes_(self) -> Optional[numpy.ndarray]: # pragma: no cover return self.classes_ + # Remove in our next major release + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3994 @property def n_classes_(self) -> int: # pragma: no cover """Get the model's number of classes. @@ -1482,6 +1490,54 @@ def __init__(self, n_bits: Union[int, Dict[str, int]] = 8): BaseEstimator.__init__(self) + @classmethod + def from_sklearn_model( + cls, + sklearn_model: sklearn.base.BaseEstimator, + X: Data, + n_bits: Union[int, Dict[str, int]] = 8, + ): + """Build a FHE-compliant model using a fitted scikit-learn model. + + Args: + sklearn_model (sklearn.base.BaseEstimator): The fitted scikit-learn model to convert. + X (Data): A representative set of input values used for computing quantization + parameters, as a Numpy array, Torch tensor, Pandas DataFrame or List. 
This is + usually the training data-set or a sub-set of it. + n_bits (int, Dict[str, int]): Number of bits to quantize the model. If an int is passed + for n_bits, the value will be used for quantizing inputs and weights. If a dict is + passed, then it should contain "op_inputs" and "op_weights" as keys with + corresponding number of quantization bits so that: + - op_inputs : number of bits to quantize the input values + - op_weights: number of bits to quantize the learned parameters + Default to 8. + + Returns: + The FHE-compliant fitted model. + """ + + # Check that sklearn_model is a proper fitted scikit-learn model + check_is_fitted(sklearn_model) + + # Extract scikit-learn's initialization parameters + init_params = sklearn_model.get_params() + + # Instantiate the Concrete ML model and update initialization parameters + # This update is necessary as we currently store scikit-learn attributes in Concrete ML + # classes during initialization (for example: link or power attributes in GLMs) + # Without it, these attributes will have default values instead of the ones used by the + # scikit-learn models + # This should be fixed once Concrete ML models initialize their underlying scikit-learn + # models during initialization + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3373 + model = cls(n_bits=n_bits, **init_params) + + # Update the underlying scikit-learn model with the given fitted one + model.sklearn_model = sklearn_model + + # Compute the quantization parameters + return model._quantize_model(X) + def _set_onnx_model(self, test_input: numpy.ndarray) -> None: """Retrieve the model's ONNX graph using Hummingbird conversion. @@ -1519,6 +1575,20 @@ def fit(self, X: Data, y: Target, **fit_parameters): # Fit the scikit-learn model self._fit_sklearn_model(X, y, **fit_parameters) + # Compute the quantization parameters + return self._quantize_model(X) + + def _quantize_model(self, X): + """Compute quantization parameters. 
+ + Args: + X (Data): A representative set of input values used for computing quantization + parameters, as a Numpy array, Torch tensor, Pandas DataFrame or List. This is + usually the training data-set or a sub-set of it. + + Returns: + The FHE-compliant fitted model. + """ # Check that the underlying sklearn model has been set and fit assert self.sklearn_model is not None, self._sklearn_model_is_not_fitted_error_message() diff --git a/tests/common/test_pbs_error_probability_settings.py b/tests/common/test_pbs_error_probability_settings.py index 31aad3aea9..1eb8748008 100644 --- a/tests/common/test_pbs_error_probability_settings.py +++ b/tests/common/test_pbs_error_probability_settings.py @@ -8,13 +8,13 @@ from torch import nn from concrete.ml.pytest.torch_models import FCSmall -from concrete.ml.pytest.utils import sklearn_models_and_datasets +from concrete.ml.pytest.utils import MODELS_AND_DATASETS from concrete.ml.torch.compile import compile_torch_model INPUT_OUTPUT_FEATURE = [5, 10] -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) @pytest.mark.parametrize( "kwargs", [ diff --git a/tests/common/test_serialization.py b/tests/common/test_serialization.py index d42e8cbf78..9607f4c57a 100644 --- a/tests/common/test_serialization.py +++ b/tests/common/test_serialization.py @@ -34,8 +34,8 @@ from concrete.ml.quantization import QuantizedModule from concrete.ml.sklearn import ( LinearRegression, + get_sklearn_all_models, get_sklearn_linear_models, - get_sklearn_models, get_sklearn_tree_models, ) @@ -217,7 +217,7 @@ def test_serialize_numpy_array(dtype): # Test the most important types @pytest.mark.parametrize( "value", - SUPPORTED_TORCH_ACTIVATIONS + get_sklearn_models()["all"] + [QuantizedModule], + SUPPORTED_TORCH_ACTIVATIONS + get_sklearn_all_models() + [QuantizedModule], ) def test_serialize_type(value): """Test serialization of type objects (trusted by 
Skops).""" diff --git a/tests/common/test_skearn_model_lists.py b/tests/common/test_skearn_model_lists.py index cd7fe34a23..fd9c967ce6 100644 --- a/tests/common/test_skearn_model_lists.py +++ b/tests/common/test_skearn_model_lists.py @@ -1,5 +1,10 @@ """Tests lists of models in Concrete ML.""" -from concrete.ml.sklearn import get_sklearn_models +from concrete.ml.sklearn import ( + get_sklearn_all_models, + get_sklearn_linear_models, + get_sklearn_neural_net_models, + get_sklearn_tree_models, +) from concrete.ml.sklearn.glm import GammaRegressor, PoissonRegressor, TweedieRegressor from concrete.ml.sklearn.linear_model import ( ElasticNet, @@ -17,35 +22,34 @@ def test_get_sklearn_models(): """List all available models in Concrete ML.""" - dic = get_sklearn_models() - cml_list = dic["all"] - linear_list = dic["linear"] - tree_list = dic["tree"] - neuralnet_list = dic["neural_net"] + all_models = get_sklearn_all_models() + linear_models = get_sklearn_linear_models() + tree_models = get_sklearn_tree_models() + neural_network_models = get_sklearn_neural_net_models() print("All models: ") - for m in cml_list: + for m in all_models: print(f" {m}") print("Linear models: ") - for m in linear_list: + for m in linear_models: print(f" {m}") print("Tree models: ") - for m in tree_list: + for m in tree_models: print(f" {m}") print("Neural net models: ") - for m in neuralnet_list: + for m in neural_network_models: print(f" {m}") # Check values - expected_neuralnet_list = [NeuralNetClassifier, NeuralNetRegressor] + expected_neural_network_models = [NeuralNetClassifier, NeuralNetRegressor] assert ( - neuralnet_list == expected_neuralnet_list - ), "Please change the expected number of models if you add new models" + neural_network_models == expected_neural_network_models + ), "Please change the expected number of models if new models have been added" - expected_tree_list = [ + expected_tree_models = [ DecisionTreeClassifier, DecisionTreeRegressor, RandomForestClassifier, @@ -54,10 
+58,10 @@ def test_get_sklearn_models(): XGBRegressor, ] assert ( - tree_list == expected_tree_list - ), "Please change the expected number of models if you add new models" + tree_models == expected_tree_models + ), "Please change the expected number of models if new models have been added" - expected_linear_list = [ + expected_linear_models = [ ElasticNet, GammaRegressor, Lasso, @@ -70,11 +74,11 @@ def test_get_sklearn_models(): TweedieRegressor, ] assert ( - linear_list == expected_linear_list - ), "Please change the expected number of models if you add new models" + linear_models == expected_linear_models + ), "Please change the expected number of models if new models have been added" # Check number - assert cml_list == sorted( - expected_linear_list + expected_neuralnet_list + expected_tree_list, + assert all_models == sorted( + expected_linear_models + expected_neural_network_models + expected_tree_models, key=lambda m: m.__name__, - ), "Please change the expected number of models if you add new models" + ), "Please change the expected number of models if new models have been added" diff --git a/tests/deployment/test_client_server.py b/tests/deployment/test_client_server.py index 783cd07ab3..b3a030ddcc 100644 --- a/tests/deployment/test_client_server.py +++ b/tests/deployment/test_client_server.py @@ -14,7 +14,7 @@ from concrete.ml.deployment.fhe_client_server import FHEModelClient, FHEModelDev, FHEModelServer from concrete.ml.pytest.torch_models import FCSmall -from concrete.ml.pytest.utils import instantiate_model_generic, sklearn_models_and_datasets +from concrete.ml.pytest.utils import MODELS_AND_DATASETS, instantiate_model_generic from concrete.ml.quantization.quantized_module import QuantizedModule from concrete.ml.torch.compile import compile_torch_model @@ -66,7 +66,7 @@ def cleanup(self): self.dev_dir.cleanup() -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", 
MODELS_AND_DATASETS) @pytest.mark.parametrize("n_bits", [3]) def test_client_server_sklearn( default_configuration, diff --git a/tests/parameter_search/test_p_error_binary_search.py b/tests/parameter_search/test_p_error_binary_search.py index fd301b25e9..2be5ddc1ee 100644 --- a/tests/parameter_search/test_p_error_binary_search.py +++ b/tests/parameter_search/test_p_error_binary_search.py @@ -20,8 +20,8 @@ ) from concrete.ml.pytest.torch_models import QuantCustomModel, TorchCustomModel from concrete.ml.pytest.utils import ( + UNIQUE_MODELS_AND_DATASETS, data_calibration_processing, - get_random_extract_of_sklearn_models_and_datasets, instantiate_model_generic, load_torch_model, ) @@ -135,9 +135,7 @@ def test_update_valid_attr_method(attr, value, model_name, quant_type, metric, l assert getattr(search, attr) == value -@pytest.mark.parametrize( - "model_class, parameters", get_random_extract_of_sklearn_models_and_datasets() -) +@pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) def test_non_convergence_for_built_in_models(model_class, parameters, load_data, is_weekly_option): """Check that binary search raises a user warning when convergence is not achieved. 
@@ -293,9 +291,7 @@ def test_binary_search_for_custom_models(model_name, quant_type, threshold): @pytest.mark.parametrize("threshold", [0.02]) -@pytest.mark.parametrize( - "model_class, parameters", get_random_extract_of_sklearn_models_and_datasets() -) +@pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) @pytest.mark.parametrize("predict", ["predict", "predict_proba"]) def test_binary_search_for_built_in_models(model_class, parameters, threshold, predict, load_data): """Check if the returned `p_error` is valid for built-in models.""" @@ -382,9 +378,7 @@ def test_invalid_estimator_for_custom_models(is_qat, load_data): search.run(x=x_calib, ground_truth=y, strategy=all, max_iter=1, n_simulation=1) -@pytest.mark.parametrize( - "model_class, parameters", get_random_extract_of_sklearn_models_and_datasets() -) +@pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) def test_invalid_estimator_for_built_in_models(model_class, parameters, load_data): """Check that binary search raises an exception for unsupported models.""" diff --git a/tests/seeding/test_seeding.py b/tests/seeding/test_seeding.py index 27af16c06e..57fd4ad52a 100644 --- a/tests/seeding/test_seeding.py +++ b/tests/seeding/test_seeding.py @@ -8,7 +8,7 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.tree import plot_tree -from concrete.ml.pytest.utils import sklearn_models_and_datasets +from concrete.ml.pytest.utils import MODELS_AND_DATASETS def test_seed_1(): @@ -79,7 +79,7 @@ def test_seed_needing_randomly_seed_arg_3(random_inputs_1, random_inputs_2, rand print("Random inputs", random_inputs_3) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) def test_seed_sklearn(model_class, parameters, load_data, default_configuration): """Test seeding of sklearn models""" diff --git a/tests/sklearn/test_common.py b/tests/sklearn/test_common.py 
index 3ce9dcede1..dd22fb3354 100644 --- a/tests/sklearn/test_common.py +++ b/tests/sklearn/test_common.py @@ -7,7 +7,7 @@ from sklearn.exceptions import ConvergenceWarning from concrete.ml.common.utils import get_model_class -from concrete.ml.pytest.utils import sklearn_models_and_datasets +from concrete.ml.pytest.utils import MODELS_AND_DATASETS from concrete.ml.sklearn import ( get_sklearn_linear_models, get_sklearn_neural_net_models, @@ -35,7 +35,7 @@ def test_sklearn_args(): assert test_counter == 18 -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) def test_seed_sklearn(model_class, parameters, load_data): """Tests the random_state parameter.""" x, y = load_data(model_class, **parameters) diff --git a/tests/sklearn/test_dump_onnx.py b/tests/sklearn/test_dump_onnx.py index 00b22c4a94..40f87c6eb1 100644 --- a/tests/sklearn/test_dump_onnx.py +++ b/tests/sklearn/test_dump_onnx.py @@ -10,7 +10,7 @@ from sklearn.exceptions import ConvergenceWarning from concrete.ml.common.utils import is_model_class_in_a_list -from concrete.ml.pytest.utils import get_model_name, sklearn_models_and_datasets +from concrete.ml.pytest.utils import MODELS_AND_DATASETS, get_model_name from concrete.ml.sklearn import get_sklearn_tree_models from concrete.ml.sklearn.qnn import NeuralNetClassifier, NeuralNetRegressor @@ -78,7 +78,7 @@ def check_onnx_file_dump(model_class, parameters, load_data, str_expected, defau assert str_model in str_expected -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) def test_dump( model_class, parameters, diff --git a/tests/sklearn/test_pandas_errors.py b/tests/sklearn/test_pandas_errors.py index 717f19ee4f..bd75db3fed 100644 --- a/tests/sklearn/test_pandas_errors.py +++ b/tests/sklearn/test_pandas_errors.py @@ -6,11 +6,10 @@ import pytest from 
concrete.ml.common.utils import is_model_class_in_a_list -from concrete.ml.pytest.utils import sklearn_models_and_datasets -from concrete.ml.sklearn import get_sklearn_neural_net_models +from concrete.ml.sklearn import get_sklearn_all_models, get_sklearn_neural_net_models -@pytest.mark.parametrize("model_class", [m[0][0] for m in sklearn_models_and_datasets]) +@pytest.mark.parametrize("model_class", get_sklearn_all_models()) @pytest.mark.parametrize( "bad_value, expected_error", [ diff --git a/tests/sklearn/test_sklearn_models.py b/tests/sklearn/test_sklearn_models.py index 13c33e12c8..b24904665c 100644 --- a/tests/sklearn/test_sklearn_models.py +++ b/tests/sklearn/test_sklearn_models.py @@ -52,15 +52,20 @@ from concrete.ml.common.serialization.loaders import load, loads from concrete.ml.common.utils import ( USE_OLD_VL, + get_model_class, get_model_name, is_classifier_or_partial_classifier, is_model_class_in_a_list, is_regressor_or_partial_regressor, ) from concrete.ml.pytest.utils import ( - _classifiers_and_datasets, + MODELS_AND_DATASETS, + UNIQUE_MODELS_AND_DATASETS, + get_sklearn_all_models_and_datasets, + get_sklearn_linear_models_and_datasets, + get_sklearn_neural_net_models_and_datasets, + get_sklearn_tree_models_and_datasets, instantiate_model_generic, - sklearn_models_and_datasets, ) from concrete.ml.sklearn import ( get_sklearn_linear_models, @@ -430,11 +435,6 @@ def check_offset(model_class, n_bits, x, y): """Check offset.""" model = instantiate_model_generic(model_class, n_bits=n_bits) - # Offsets are not supported by XGBoost - if is_model_class_in_a_list(model_class, get_sklearn_tree_models(str_in_class_name="XGB")): - # No pytest.skip, since it is not a bug but something which is inherent to XGB - return - # Sometimes, we miss convergence, which is not a problem for our test with warnings.catch_warnings(): warnings.simplefilter("ignore", category=ConvergenceWarning) @@ -585,15 +585,6 @@ def cast_input(x, y, input_type): def 
check_pipeline(model_class, x, y): """Check pipeline support.""" - - # Pipeline test sometimes fails with RandomForest models. This bug may come from Hummingbird - # and needs further investigations - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/2779 - if is_model_class_in_a_list( - model_class, get_sklearn_tree_models(str_in_class_name="RandomForest") - ): - pytest.skip("Skipping pipeline test for RF, doesn't work for now") - hyper_param_combinations = get_hyper_param_combinations(model_class) # Prepare the list of all hyper parameters @@ -674,14 +665,6 @@ def check_sklearn_equivalence(model_class, n_bits, x, y, check_accuracy, check_r the scikit-learn model.""" model = instantiate_model_generic(model_class, n_bits=n_bits) - # The `fit_benchmark` function of QNNs returns a QAT model and a FP32 model that is similar - # in structure but trained from scratch. Furthermore, the `n_bits` setting - # of the QNN instantiation in `instantiate_model_generic` takes `n_bits` as - # a target accumulator and sets 3-b w&a for these tests. 
Thus it's - # impossible to reach R-2 of 0.99 when comparing the two NN models returned by `fit_benchmark` - if is_model_class_in_a_list(model_class, get_sklearn_neural_net_models()): - pytest.skip("Skipping sklearn-equivalence test for NN, doesn't work for now") - # Sometimes, we miss convergence, which is not a problem for our test with warnings.catch_warnings(): warnings.simplefilter("ignore", category=ConvergenceWarning) @@ -1036,7 +1019,47 @@ def check_mono_parameter_warnings(model, x, default_configuration): model.compile(x, default_configuration) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +def check_load_fitted_sklearn_linear_models(model_class, n_bits, x, y): + """Check that linear models support loading from fitted scikit-learn models.""" + + model = instantiate_model_generic(model_class, n_bits=n_bits) + + # Fit the model and retrieve both the Concrete ML and the scikit-learn models + with warnings.catch_warnings(): + # Sometimes, we miss convergence, which is not a problem for our test + warnings.simplefilter("ignore", category=ConvergenceWarning) + concrete_model, sklearn_model = model.fit_benchmark(x, y) + + # This step is needed in order to handle partial classes + model_class = get_model_class(model_class) + + # Load a Concrete ML model from the fitted scikit-learn one + loaded_concrete_model = model_class.from_sklearn_model(sklearn_model, X=x, n_bits=n_bits) + + # Compile both the initial Concrete ML model and the loaded one + concrete_model.compile(x) + loaded_concrete_model.compile(x) + + # Compute and compare the predictions from both models + y_pred_simulate = concrete_model.predict(x, fhe="simulate") + y_pred_simulate_loaded = loaded_concrete_model.predict(x, fhe="simulate") + + assert numpy.isclose(y_pred_simulate, y_pred_simulate_loaded).all(), ( + "Simulated predictions from the initial model do not match the ones made from the " + "loaded one."
+ ) + + +# Neural network models are skipped for this test +# The `fit_benchmark` function of QNNs returns a QAT model and a FP32 model that is similar +# in structure but trained from scratch. Furthermore, the `n_bits` setting +# of the QNN instantiation in `instantiate_model_generic` takes `n_bits` as +# a target accumulator and sets 3-b w&a for these tests. Thus it's +# impossible to reach R-2 of 0.99 when comparing the two NN models returned by `fit_benchmark` +@pytest.mark.parametrize( + "model_class, parameters", + get_sklearn_linear_models_and_datasets() + get_sklearn_tree_models_and_datasets(), +) @pytest.mark.parametrize( "n_bits", [ @@ -1067,7 +1090,7 @@ def test_quantization( # This test is a known flaky # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3661 @pytest.mark.flaky -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) @pytest.mark.parametrize( "n_bits", [ @@ -1104,7 +1127,7 @@ def test_correctness_with_sklearn( ) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) @pytest.mark.parametrize( "n_bits", N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS, @@ -1138,7 +1161,7 @@ def test_hyper_parameters( ) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) @pytest.mark.parametrize("n_bits", [3]) # The complete list of built-in scoring functions can be found in scikit-learn's documentation: # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter @@ -1203,7 +1226,7 @@ def test_grid_search( check_grid_search(model_class, x, y, scoring) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) 
@pytest.mark.parametrize("use_dump_method", [True, False]) def test_serialization( model_class, @@ -1230,7 +1253,7 @@ def test_serialization( check_serialization(model, x, use_dump_method) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) @pytest.mark.parametrize( "n_bits", N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS, @@ -1262,7 +1285,11 @@ def test_double_fit( check_double_fit(model_class, n_bits, x_1, x_2, y_1, y_2) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +# Offsets are not supported by XGBoost models +@pytest.mark.parametrize( + "model_class, parameters", + get_sklearn_all_models_and_datasets(exclude="XGB", unique_models=True), +) @pytest.mark.parametrize( "n_bits", N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS, @@ -1284,7 +1311,7 @@ def test_offset( check_offset(model_class, n_bits, x, y) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) @pytest.mark.parametrize( "n_bits", N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS, @@ -1309,7 +1336,7 @@ def test_input_support( check_input_support(model_class, n_bits, default_configuration, x, y, input_type) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) @pytest.mark.parametrize( "n_bits", N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS, @@ -1331,7 +1358,12 @@ def test_subfunctions( check_subfunctions(model, model_class, x) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +# Pipeline test sometimes fails with RandomForest models. 
This bug may come from Hummingbird +# and needs further investigations +# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/2779 +@pytest.mark.parametrize( + "model_class, parameters", get_sklearn_all_models_and_datasets(exclude="RandomForest") +) @pytest.mark.parametrize( "n_bits", N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS, @@ -1353,7 +1385,7 @@ def test_pipeline( check_pipeline(model_class, x, y) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) @pytest.mark.parametrize( "simulate", [ @@ -1468,7 +1500,7 @@ def test_predict_correctness( assert numpy.array_equal(y_pred_fhe, y_pred) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) def test_fitted_compiled_error_raises( model_class, parameters, @@ -1487,7 +1519,7 @@ def test_fitted_compiled_error_raises( check_fitted_compiled_error_raises(model_class, n_bits, x, y) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) @pytest.mark.parametrize( "error_param", [{"p_error": 0.9999999999990905}], # 1 - 2**-40 @@ -1558,7 +1590,10 @@ def check_for_divergent_predictions(x, model, fhe, max_iterations=N_ALLOWED_FHE_ ) -@pytest.mark.parametrize("model_class, parameters", _classifiers_and_datasets) +# This test is only relevant for classifier models +@pytest.mark.parametrize( + "model_class, parameters", get_sklearn_all_models_and_datasets(regressor=False, classifier=True) +) def test_class_mapping( model_class, parameters, @@ -1579,7 +1614,7 @@ def test_class_mapping( check_class_mapping(model, x, y) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) def 
test_exposition_of_sklearn_attributes( model_class, parameters, @@ -1600,7 +1635,9 @@ def test_exposition_of_sklearn_attributes( check_exposition_of_sklearn_attributes(model, x, y) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +@pytest.mark.parametrize( + "model_class, parameters", get_sklearn_tree_models_and_datasets(include="DecisionTree") +) def test_exposition_structural_methods_decision_trees( model_class, parameters, @@ -1609,9 +1646,6 @@ def test_exposition_structural_methods_decision_trees( verbose=True, ): """Test the exposition of specific structural methods found in decision tree models.""" - if get_model_name(model_class) not in ["DecisionTreeClassifier", "DecisionTreeRegressor"]: - return - n_bits = min(N_BITS_REGULAR_BUILDS) x, y = get_dataset(model_class, parameters, n_bits, load_data, is_weekly_option) @@ -1624,7 +1658,14 @@ def test_exposition_structural_methods_decision_trees( check_exposition_structural_methods_decision_trees(model, x, y) -@pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) +# Enable linear models as well once Concrete Python fixes the multi-parameter bug with +# fully-leveled circuits +# TODO: https://github.com/zama-ai/concrete-ml-internal/issues/3862 +@pytest.mark.parametrize( + "model_class, parameters", + get_sklearn_tree_models_and_datasets(unique_models=True) + + get_sklearn_neural_net_models_and_datasets(unique_models=True), +) def test_mono_parameter_warnings( model_class, parameters, @@ -1635,12 +1676,6 @@ def test_mono_parameter_warnings( ): """Test that setting voluntarily a mono-parameter strategy properly raises a warning.""" - # Remove this once Concrete Python fixes the multi-parameter bug with fully-leveled circuits - # TODO: https://github.com/zama-ai/concrete-ml-internal/issues/3862 - # Linear models are manually forced to use mono-parameter - if is_model_class_in_a_list(model_class, get_sklearn_linear_models()): - return - n_bits = 
min(N_BITS_REGULAR_BUILDS) model, x = preamble(model_class, parameters, n_bits, load_data, is_weekly_option) @@ -1649,3 +1684,24 @@ def test_mono_parameter_warnings( print("Run check_mono_parameter_warnings") check_mono_parameter_warnings(model, x, default_configuration) + + +# Importing fitted models only works with linear models +@pytest.mark.parametrize("model_class, parameters", get_sklearn_linear_models_and_datasets()) +def test_load_fitted_sklearn_linear_models( + model_class, + parameters, + load_data, + is_weekly_option, + verbose=True, +): + """Test that linear models support loading from fitted scikit-learn models.""" + + n_bits = min(N_BITS_REGULAR_BUILDS) + + x, y = get_dataset(model_class, parameters, n_bits, load_data, is_weekly_option) + + if verbose: + print("Run check_load_pre_trained_sklearn_models") + + check_load_fitted_sklearn_linear_models(model_class, n_bits, x, y)