diff --git a/src/concrete/ml/sklearn/base.py b/src/concrete/ml/sklearn/base.py
index bf0090f2e..0184615a7 100644
--- a/src/concrete/ml/sklearn/base.py
+++ b/src/concrete/ml/sklearn/base.py
@@ -1719,14 +1719,14 @@ def __init__(self, n_bits: int = 3):
                 quantizing inputs and X_fit. Default to 3.
         """
         self.n_bits: int = n_bits
-        # _q_X_fit: In distance metric algorithms, `_q_X_fit` stores the training set to compute
+        # _q_fit_X: In distance metric algorithms, `_q_fit_X` stores the training set to compute
         # the similarity or distance measures. There is no `weights` attribute because there isn't
         # a training phase
-        self._q_X_fit: numpy.ndarray
-        # _y: Labels of `_q_X_fit`
+        self._q_fit_X: numpy.ndarray
+        # _y: Labels of `_q_fit_X`
         self._y: numpy.ndarray
-        # _q_X_fit_quantizer: The quantizer to use for quantizing the model's training set
-        self._q_X_fit_quantizer: Optional[UniformQuantizer] = None
+        # _q_fit_X_quantizer: The quantizer to use for quantizing the model's training set
+        self._q_fit_X_quantizer: Optional[UniformQuantizer] = None

         BaseEstimator.__init__(self)
@@ -1768,8 +1768,6 @@ def fit(self, X: Data, y: Target, **fit_parameters):
         # KNeighbors handles multi-labels data
         X, y = check_X_y_and_assert_multi_output(X, y)

-        self._y = numpy.array(y)
-
         # Fit the scikit-learn model
         self._fit_sklearn_model(X, y, **fit_parameters)
@@ -1785,28 +1783,30 @@ def fit(self, X: Data, y: Target, **fit_parameters):
         input_quantizer = q_inputs.quantizer
         self.input_quantizers.append(input_quantizer)

-        # Quantize the _X_fit and store the associated quantizer
+        # Quantize the _fit_X and store the associated quantizer
         # pylint: disable-next=protected-access
-        _X_fit = self.sklearn_model._fit_X
-        # We assume that the inputs have the same distribution as the _X_fit
-        q_X_fit = QuantizedArray(
+        _fit_X = self.sklearn_model._fit_X
+        # We assume that the inputs have the same distribution as the _fit_X
+        q_fit_X = QuantizedArray(
             n_bits=self.n_bits,
-            values=numpy.expand_dims(_X_fit, axis=1) if len(_X_fit.shape) == 1 else _X_fit,
+            values=numpy.expand_dims(_fit_X, axis=1) if len(_fit_X.shape) == 1 else _fit_X,
             options=input_options,
         )
-        self._q_X_fit = q_X_fit.qvalues
-        self._q_X_fit_quantizer = q_X_fit.quantizer
+        self._q_fit_X = q_fit_X.qvalues
+        self._q_fit_X_quantizer = q_fit_X.quantizer

         # mypy
-        assert self._q_X_fit_quantizer.scale is not None
+        assert self._q_fit_X_quantizer.scale is not None
+
+        self._y = numpy.array(y)

         # We assume that the query has the same distribution as the data in _X_fit.
         # therefore, they use the same scaling and zero point.
         # https://arxiv.org/abs/1712.05877
         self.output_quant_params = UniformQuantizationParameters(
-            scale=self._q_X_fit_quantizer.scale,
-            zero_point=self._q_X_fit_quantizer.zero_point,
+            scale=self._q_fit_X_quantizer.scale,
+            zero_point=self._q_fit_X_quantizer.zero_point,
             offset=0,
         )
@@ -1879,15 +1879,15 @@ def _inference(self, q_X: numpy.ndarray) -> numpy.ndarray:
         Returns:
             numpy.ndarray: The quantized predicted values.
         """
-        assert self._q_X_fit_quantizer is not None, self._is_not_fitted_error_message()
+        assert self._q_fit_X_quantizer is not None, self._is_not_fitted_error_message()

         def pairwise_euclidean_distance(q_X):
             # 1. Pairwise euclidean distance
             # dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))
             return (
                 numpy.sum(q_X**2, axis=1, keepdims=True)
-                - 2 * q_X @ self._q_X_fit.T
-                + numpy.expand_dims(numpy.sum(self._q_X_fit**2, axis=1), 0)
+                - 2 * q_X @ self._q_fit_X.T
+                + numpy.expand_dims(numpy.sum(self._q_fit_X**2, axis=1), 0)
             )

         def topk_sorting(x, labels):
@@ -1896,7 +1896,8 @@ def topk_sorting(x, labels):
             Time complexity: O(nlog²(k))

             Args:
-                x (numpy.ndarray): The quantized input values.
+                x (numpy.ndarray): The quantized input values
+                labels (numpy.ndarray): The labels of the training data-set

             Returns:
                 numpy.ndarray: The argsort.
@@ -1982,10 +1983,10 @@ def scatter1d(x, v, indices):
                 x = scatter1d(x, max_x, range_i + d)

                 # Max index selection
-                sign = diff <= 0
+                is_a_greater_than_b = diff <= 0

                 # Update labels array according to the max items
-                max_labels = labels_a + (labels_b - labels_a) * sign
+                max_labels = labels_a + (labels_b - labels_a) * is_a_greater_than_b
                 labels = scatter1d(labels, labels_a + labels_b - max_labels, range_i)
                 labels = scatter1d(labels, max_labels, range_i + d)
@@ -2002,8 +2003,6 @@ def scatter1d(x, v, indices):
             return fhe_array(topk_labels)

         # 1. Pairwise_euclidiean distance
-        # from concrete import fhe
-        # with fhe.tag(f"distance_matrix"):
         distance_matrix = pairwise_euclidean_distance(q_X)

         # The square root in the Euclidean distance calculation is not applied to speed up FHE
@@ -2011,10 +2010,6 @@ def scatter1d(x, v, indices):
         # Being a monotonic function, it does not affect the logic of the calculation, notably for
         # the argsort.

-        # 2. Sorting args
-        # with fhe.tag(f"sorted_args"):
-
-        # pylint: disable-next=protected-access
         topk_labels = topk_sorting(distance_matrix.flatten(), self._y)

         return numpy.expand_dims(topk_labels, axis=0)
@@ -2038,17 +2033,34 @@ def compile(self, *args, **kwargs) -> Circuit:
         return BaseEstimator.compile(self, *args, **kwargs)

+    def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray:
+        """Perform the majority vote.
+
+        For KNN, the de-quantization step is not required, because _inference returns the labels
+        of the k-nearest neighbors.
+
+        Args:
+            y_preds (numpy.ndarray): The top-k nearest labels
+
+        Returns:
+            numpy.ndarray: The majority vote.
+        """
+        y_preds_processed = []
+        for y in y_preds:
+            vote = self.majority_vote(y.flatten())
+            y_preds_processed.append(vote)
+
+        return numpy.array(y_preds_processed)
+
     def predict(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy.ndarray:

         X = check_array_and_assert(X)

-        y_preds = []
+        topk_labels = []
         for query in X:
-            # Argsort
-            topk_labels = super().predict(query[None], fhe)
-            # Majority vote
-            y_pred = self.majority_vote(topk_labels.flatten())
-            y_preds.append(y_pred)
+            topk_labels.append(super().predict(query[None], fhe))
+
+        y_preds = self.post_processing(numpy.array(topk_labels))

         return numpy.array(y_preds)
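Note on the base.py changes above: the following clear-text sketch mirrors what `_inference` and the new `post_processing` compute together — squared distances via the dot-product expansion (the square root is skipped because it is monotonic and does not change the argsort), then a majority vote over the k nearest labels. The helper names (`pairwise_squared_euclidean`, `majority_vote_clear`) and the toy data are illustrative assumptions, not Concrete ML API, and everything runs on plain NumPy arrays rather than quantized FHE values.

import numpy

def pairwise_squared_euclidean(q_X, q_fit_X):
    # dist^2(x, y) = dot(x, x) - 2 * dot(x, y) + dot(y, y), broadcast over all pairs
    return (
        numpy.sum(q_X**2, axis=1, keepdims=True)
        - 2 * q_X @ q_fit_X.T
        + numpy.expand_dims(numpy.sum(q_fit_X**2, axis=1), 0)
    )

def majority_vote_clear(labels):
    # Most frequent label among the k nearest neighbors
    values, counts = numpy.unique(labels, return_counts=True)
    return values[numpy.argmax(counts)]

# Toy data: four training points, two classes, one query, k = 3
q_fit_X = numpy.array([[0, 0], [1, 1], [4, 4], [5, 5]])
y = numpy.array([0, 0, 1, 1])
q_X = numpy.array([[1, 0]])

distances = pairwise_squared_euclidean(q_X, q_fit_X)     # shape (1, 4)
topk_labels = y[numpy.argsort(distances.flatten())[:3]]  # labels of the 3 nearest points
print(majority_vote_clear(topk_labels))                  # -> 0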
+ """ + y_preds_processed = [] + for y in y_preds: + vote = self.majority_vote(y.flatten()) + y_preds_processed.append(vote) + + return numpy.array(y_preds_processed) + def predict(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy.ndarray: X = check_array_and_assert(X) - y_preds = [] + topk_labels = [] for query in X: - # Argsort - topk_labels = super().predict(query[None], fhe) - # Majority vote - y_pred = self.majority_vote(topk_labels.flatten()) - y_preds.append(y_pred) + topk_labels.append(super().predict(query[None], fhe)) + + y_preds = self.post_processing(numpy.array(topk_labels)) return numpy.array(y_preds) diff --git a/src/concrete/ml/sklearn/neighbors.py b/src/concrete/ml/sklearn/neighbors.py index 3fed38276..368c9690b 100644 --- a/src/concrete/ml/sklearn/neighbors.py +++ b/src/concrete/ml/sklearn/neighbors.py @@ -61,7 +61,7 @@ def __init__( self.weights = weights def dump_dict(self) -> Dict[str, Any]: - assert self._q_X_fit_quantizer is not None, self._is_not_fitted_error_message() + assert self._q_fit_X_quantizer is not None, self._is_not_fitted_error_message() metadata: Dict[str, Any] = {} @@ -71,8 +71,8 @@ def dump_dict(self) -> Dict[str, Any]: metadata["_is_fitted"] = self._is_fitted metadata["_is_compiled"] = self._is_compiled metadata["input_quantizers"] = self.input_quantizers - metadata["_q_X_fit_quantizer"] = self._q_X_fit_quantizer - metadata["_q_X_fit"] = self._q_X_fit + metadata["_q_fit_X_quantizer"] = self._q_fit_X_quantizer + metadata["_q_fit_X"] = self._q_fit_X metadata["_y"] = self._y metadata["output_quantizers"] = self.output_quantizers @@ -106,8 +106,8 @@ def load_dict(cls, metadata: Dict): obj._is_compiled = metadata["_is_compiled"] obj.input_quantizers = metadata["input_quantizers"] obj.output_quantizers = metadata["output_quantizers"] - obj._q_X_fit_quantizer = metadata["_q_X_fit_quantizer"] - obj._q_X_fit = metadata["_q_X_fit"] + obj._q_fit_X_quantizer = metadata["_q_fit_X_quantizer"] + obj._q_fit_X = metadata["_q_fit_X"] obj._y = metadata["_y"] obj.onnx_model_ = metadata["onnx_model_"] diff --git a/tests/deployment/test_client_server.py b/tests/deployment/test_client_server.py index 7df681a1a..05c7fd53a 100644 --- a/tests/deployment/test_client_server.py +++ b/tests/deployment/test_client_server.py @@ -14,11 +14,15 @@ from concrete.ml.deployment.fhe_client_server import FHEModelClient, FHEModelDev, FHEModelServer from concrete.ml.pytest.torch_models import FCSmall -from concrete.ml.pytest.utils import instantiate_model_generic, sklearn_models_and_datasets +from concrete.ml.pytest.utils import ( + get_model_name, + instantiate_model_generic, + sklearn_models_and_datasets, +) from concrete.ml.quantization.quantized_module import QuantizedModule from concrete.ml.torch.compile import compile_torch_model -# pylint: disable=too-many-statements +# pylint: disable=too-many-statements,too-many-locals class OnDiskNetwork: @@ -67,7 +71,7 @@ def cleanup(self): @pytest.mark.parametrize("model_class, parameters", sklearn_models_and_datasets) -@pytest.mark.parametrize("n_bits", [2]) +@pytest.mark.parametrize("n_bits", [3]) def test_client_server_sklearn( default_configuration, model_class, @@ -99,10 +103,17 @@ def test_client_server_sklearn( with pytest.raises(AttributeError, match=".* model is not compiled.*"): client_server_simulation(x_train, x_test, model, default_configuration) - # With n_bits = 3, KNN is not compilable fhe_circuit = model.compile( x_train, default_configuration, **extra_params, show_mlir=(n_bits <= 8) ) + + if 
+    if get_model_name(model) == "KNeighborsClassifier":
+        # Fit the model
+        with warnings.catch_warnings():
+            # Sometimes, we miss convergence, which is not a problem for our test
+            warnings.simplefilter("ignore", category=ConvergenceWarning)
+            model.fit(x, y)
+
     max_bit_width = fhe_circuit.graph.maximum_integer_bit_width()
     print(f"Max width {max_bit_width}")
@@ -259,5 +270,10 @@ def client_server_simulation(x_train, x_test, model, default_configuration):
         y_pred_on_client_dequantized, y_pred_model_server_ds_dequantized
     )

+    # Make sure the clear predictions match the server's de-quantized predictions
+    if get_model_name(model) == "KNeighborsClassifier":
+        y_pred_model_clear = model.predict(x_test, fhe="disable")
+        numpy.testing.assert_array_equal(y_pred_model_clear, y_pred_model_server_ds_dequantized)
+
     # Clean up
     network.cleanup()
diff --git a/tests/sklearn/test_sklearn_models.py b/tests/sklearn/test_sklearn_models.py
index 0342198ba..307e412d3 100644
--- a/tests/sklearn/test_sklearn_models.py
+++ b/tests/sklearn/test_sklearn_models.py
@@ -1569,7 +1569,8 @@ def check_for_divergent_predictions(x, model, fhe, max_iterations=N_ALLOWED_FHE_
     predict_function = (
         model.predict_proba
         if is_classifier_or_partial_classifier(model)
-        # predict_prob not implemented yet for KNeighborsClassifier
+        # `predict_proba` not implemented yet for KNeighborsClassifier
+        # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962
         and get_model_name(model) != "KNeighborsClassifier"
         else model.predict
     )
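A closing note on the quantization assumption referenced in `fit` (https://arxiv.org/abs/1712.05877): because the query and `_fit_X` share one scale and zero point, the zero points cancel in differences, so the integer squared distances computed in FHE equal the real squared distances up to the constant factor scale², and the ordering of neighbors is preserved. A toy numeric sketch, with an assumed scale and zero point and a hypothetical `quantize` helper (not the library's quantizer):

import numpy

scale, zero_point = 0.05, 7  # assumed toy quantization parameters

def quantize(x):
    # Uniform affine quantization: q = round(x / scale) + zero_point
    return numpy.round(x / scale).astype(numpy.int64) + zero_point

x = numpy.array([0.30, 0.10])  # query point
y = numpy.array([0.05, 0.25])  # one training point

q_diff = quantize(x) - quantize(y)  # the shared zero point cancels here
q_dist2 = numpy.sum(q_diff**2)      # integer squared distance, as used in the circuit

# Multiplying by scale**2 recovers the real squared distance (up to rounding)
print(q_dist2 * scale**2)       # -> ~0.085
print(numpy.sum((x - y) ** 2))  # -> ~0.085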