chore: force the configuration of KNN to run under MONO settings
kcelia committed Sep 19, 2023
1 parent ca03c3c commit 9d0a4dd
Showing 9 changed files with 141 additions and 183 deletions.
2 changes: 2 additions & 0 deletions conftest.py
@@ -499,6 +499,8 @@ def check_is_good_execution_for_cml_vs_circuit_impl(
# `check_subfunctions_in_fhe`
if is_classifier_or_partial_classifier(model):
if isinstance(model, SklearnKNeighborsMixin):
# For KNN `predict_proba` is not supported for now
# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962
results_cnp_circuit = model.predict(*inputs, fhe=fhe_mode)
results_model = model.predict(*inputs, fhe="disable")
else:
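For context, a hedged sketch of the full branch this hunk modifies — the `else` body is an assumption based on the comment above, not shown in this excerpt:

    if isinstance(model, SklearnKNeighborsMixin):
        # For KNN `predict_proba` is not supported for now
        results_cnp_circuit = model.predict(*inputs, fhe=fhe_mode)
        results_model = model.predict(*inputs, fhe="disable")
    else:
        results_cnp_circuit = model.predict_proba(*inputs, fhe=fhe_mode)
        results_model = model.predict_proba(*inputs, fhe="disable")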
20 changes: 2 additions & 18 deletions src/concrete/ml/search_parameters/p_error_search.py
@@ -58,11 +58,9 @@

import numpy
import torch
from concrete.fhe import ParameterSelectionStrategy
from concrete.fhe.compilation import Configuration
from tqdm import tqdm

from ..common.utils import get_model_name, is_brevitas_model, is_model_class_in_a_list
from ..common.utils import is_brevitas_model, is_model_class_in_a_list
from ..sklearn import (
get_sklearn_neighbors_models,
get_sklearn_neural_net_models,
@@ -110,16 +108,6 @@ def compile_and_simulated_fhe_inference(
"""

compile_params: Dict = {}

default_configuration = Configuration(
dump_artifacts_on_unexpected_failures=False,
enable_unsafe_features=True,
use_insecure_key_cache=True,
insecure_key_cache_location="ConcreteNumpyKeyCache",
parameter_selection_strategy=ParameterSelectionStrategy.MONO
if get_model_name(estimator) == "KNeighborsClassifier"
else ParameterSelectionStrategy.MULTI,
)
compile_function: Callable[..., Any]
dequantized_output: numpy.ndarray

@@ -150,11 +138,7 @@ def compile_and_simulated_fhe_inference(
if not estimator.is_fitted:
estimator.fit(calibration_data, ground_truth)

estimator.compile(
calibration_data,
p_error=p_error,
configuration=default_configuration,
)
estimator.compile(calibration_data, p_error=p_error)
predict_method = getattr(estimator, predict)
dequantized_output = predict_method(calibration_data, fhe="simulate")
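The custom Configuration is no longer needed here: the KNN estimator now forces the mono-parameter strategy itself (see the compile override in base.py below). A minimal sketch of what a helper like force_mono_parameter_in_configuration might look like — its actual location and exact signature are assumptions:

    from concrete.fhe import ParameterSelectionStrategy
    from concrete.fhe.compilation import Configuration

    def force_mono_parameter_in_configuration(configuration=None, **kwargs):
        # Create a configuration if none is given, then force MONO selection
        if configuration is None:
            configuration = Configuration(**kwargs)
        configuration.parameter_selection_strategy = ParameterSelectionStrategy.MONO
        return configuration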

144 changes: 75 additions & 69 deletions src/concrete/ml/sklearn/base.py
@@ -632,7 +632,6 @@ def predict(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy.
for q_X_i in q_X:
# Expected encrypt_run_decrypt output shape is (1, n_features) while q_X_i
# is of shape (n_features,)

q_X_i = numpy.expand_dims(q_X_i, 0)

# For mypy, even though we already check this with self.check_model_is_compiled()
@@ -1697,7 +1696,7 @@ def predict_proba(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) ->
return y_proba


# pylint: disable=invalid-name,too-many-instance-attributes
# pylint: disable-next=invalid-name,too-many-instance-attributes
class SklearnKNeighborsMixin(BaseEstimator, sklearn.base.BaseEstimator, ABC):
"""A Mixin class for sklearn KNeighbors models with FHE.
@@ -1712,24 +1711,22 @@ def __init_subclass__(cls):
_NEIGHBORS_MODELS.add(cls)
_ALL_SKLEARN_MODELS.add(cls)

def __init__(self, n_bits: Union[int, Dict[str, int]] = 3):
def __init__(self, n_bits: int = 3):
"""Initialize the FHE knn model.
Args:
n_bits (int, Dict[str, int]): Number of bits to quantize the model. If an int is passed
for n_bits, the value will be used for quantizing inputs and weights. If a dict is
passed, then it should contain "op_inputs" and "op_weights" as keys with
corresponding number of quantization bits so that:
- op_inputs : number of bits to quantize the input values
- op_weights: number of bits to quantize the learned parameters
Default to 3.
n_bits (int): Number of bits to quantize the model. The value will be used for
quantizing inputs and X_fit. Defaults to 3.
"""
self.n_bits: Union[int, Dict[str, int]] = n_bits

#: The quantizer to use for quantizing the model's weights
self._weight_quantizer: Optional[UniformQuantizer] = None
self._q_X_fit_quantizer: Optional[UniformQuantizer] = None
self.n_bits: int = n_bits
# _q_X_fit: In distance metric algorithms, `_q_X_fit` stores the training set to compute
# the similarity or distance measures. There is no `weights` attribute because there isn't
# a training phase
self._q_X_fit: numpy.ndarray
# _y: Labels of `_q_X_fit`
self._y: numpy.ndarray
# _q_X_fit_quantizer: The quantizer to use for quantizing the model's training set
self._q_X_fit_quantizer: Optional[UniformQuantizer] = None

BaseEstimator.__init__(self)

@@ -1748,7 +1745,7 @@ def _set_onnx_model(self, test_input: numpy.ndarray) -> None:
test_input=test_input,
extra_config={
"onnx_target_opset": OPSET_VERSION_FOR_ONNX_EXPORT,
# pylint: disable=protected-access, no-member
# pylint: disable-next=protected-access, no-member
constants.BATCH_SIZE: self.sklearn_model._fit_X.shape[0],
},
).model
@@ -1765,6 +1762,8 @@ def _clean_graph(self) -> None:
def fit(self, X: Data, y: Target, **fit_parameters):
# Reset for double fit
self._is_fitted = False
self.input_quantizers = []
self.output_quantizers = []

# KNeighbors handles multi-labels data
X, y = check_X_y_and_assert_multi_output(X, y)
@@ -1780,31 +1779,23 @@ def fit(self, X: Data, y: Target, **fit_parameters):
# Retrieve the ONNX graph
self._set_onnx_model(X)

# Convert the n_bits attribute into a proper dictionary
n_bits = get_n_bits_dict(self.n_bits)

input_n_bits = n_bits["op_inputs"]
input_options = QuantizationOptions(n_bits=input_n_bits, is_signed=True)

# Quantize the inputs and store the associated quantizer
q_inputs = QuantizedArray(n_bits=input_n_bits, values=X, options=input_options)
input_options = QuantizationOptions(n_bits=self.n_bits, is_signed=True)
q_inputs = QuantizedArray(n_bits=self.n_bits, values=X, options=input_options)
input_quantizer = q_inputs.quantizer
self.input_quantizers.append(input_quantizer)

weights_n_bits = n_bits["op_weights"]
weight_options = QuantizationOptions(n_bits=weights_n_bits, is_signed=True)

# Quantize the _X_fit and store the associated quantizer
# Weights in KNN algorithms are the train data points
# pylint: disable=protected-access
# pylint: disable-next=protected-access
_X_fit = self.sklearn_model._fit_X
# We assume that the inputs have the same distribution as the _X_fit
q_X_fit = QuantizedArray(
n_bits=n_bits["op_weights"],
n_bits=self.n_bits,
values=numpy.expand_dims(_X_fit, axis=1) if len(_X_fit.shape) == 1 else _X_fit,
options=weight_options,
options=input_options,
)
self._q_X_fit = q_X_fit.qvalues
self._q_X_fit_quantizer = self._weight_quantizer = q_X_fit.quantizer
self._q_X_fit_quantizer = q_X_fit.quantizer

# mypy
assert self._q_X_fit_quantizer.scale is not None
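For reference, the shared-quantizer pattern above as a standalone sketch (import path and data are illustrative assumptions):

    import numpy
    from concrete.ml.quantization import QuantizationOptions, QuantizedArray

    n_bits = 3
    X = numpy.random.randn(100, 4)       # inputs
    X_fit = numpy.random.randn(50, 4)    # training set, assumed same distribution

    options = QuantizationOptions(n_bits=n_bits, is_signed=True)
    q_inputs = QuantizedArray(n_bits=n_bits, values=X, options=options)
    q_X_fit = QuantizedArray(n_bits=n_bits, values=X_fit, options=options)
    # Both arrays share the same signed n_bits options, as in the fit() hunk above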
@@ -1821,9 +1812,6 @@ def fit(self, X: Data, y: Target, **fit_parameters):

output_quantizer = UniformQuantizer(params=self.output_quant_params, no_clipping=True)

# Since the matmul and the bias both use the same scale and zero-points, we obtain that
# y = S*(q_y - 2*Z) when de-quantizing the values. We therefore need to multiply the initial
# output zero_point by 2
assert output_quantizer.zero_point is not None
self.output_quantizers.append(output_quantizer)

@@ -1843,14 +1831,8 @@ def quantize_input(self, X: numpy.ndarray) -> numpy.ndarray:

def dequantize_output(self, q_y_preds: numpy.ndarray) -> numpy.ndarray:
self.check_model_is_fitted()

# We compute the sorted argmax in FHE, which yields integers.
# No need to de-quantize the output values

assert q_y_preds[0].shape[-1] == self.n_neighbors, (
f"Shape error: `q_y_preds` must be shape of ({self.n_neighbors},) and got:"
f"`{q_y_preds.shape}`"
)
return q_y_preds

def _get_module_to_compile(self) -> Union[Compiler, QuantizedModule]:
@@ -1911,6 +1893,8 @@ def pairwise_euclidean_distance(q_X):
def topk_sorting(x):
"""Argsort in FHE.
Time complexity: O(nlog²(k))
Args:
x (numpy.ndarray): The quantized input values.
@@ -1951,78 +1935,81 @@ def scatter1d(x, v, indices):
x[i] = v[idx]
return x

def mul_tlu(a, b):
"""Matrix multiplication.
Args:
a (numpy.ndarray): An encrypted array
b (numpy.ndarray): An encrypted array
Returns:
numpy.ndarray: The result of a * b
"""
return a * b

comparisons = numpy.zeros(x.shape)
idx = numpy.arange(x.size) + fhe_zeros(x.shape)

n, k = x.size, self.n_neighbors
ln2n = int(numpy.ceil(numpy.log2(n)))

# Number of stages
for t in range(ln2n - 1, -1, -1):
p = 2**t
r = 0
# d: Length of the bitonic sequence
d = p

for bq in range(ln2n - 1, t - 1, -1):
q = 2**bq
# Determine the range of indexes to be compared
range_i = numpy.array(
[i for i in range(0, n - d) if i & p == r and comparisons[i] < k]
)
if len(range_i) == 0:
# Edge case, for k=1
continue

a = gather1d(x, range_i) # x[range_i]
a_i = gather1d(idx, range_i) # idx[range_i]
b = gather1d(x, range_i + d) # x[range_i + d]
b_i = gather1d(idx, range_i + d) # idx[range_i + d]
# Select 2 bitonic sequences `a` and `b` of length `d`
# a = x[range_i]: First bitonic sequence
# a_i = idx[range_i]: Indexes of a's elements in the original x
a = gather1d(x, range_i)
a_i = gather1d(idx, range_i)
# b = x[range_i + d]: Second bitonic sequence
# b_i = idx[range_i + d]: Indexes of b's elements in the original x
b = gather1d(x, range_i + d)
b_i = gather1d(idx, range_i + d)

# Select max(a, b)
diff = a - b
sign = diff < 0

max_x = a + numpy.maximum(0, b - a)
x = scatter1d(x, a + b - max_x, range_i) # x[range_i] = a + b - max_x
x = scatter1d(x, max_x, range_i + d) # x[range_i + d] = max_x

max_idx = a_i + mul_tlu((b_i - a_i), sign)
# Swap if a > b
# x[range_i] = min(a, b): First bitonic sequence gets min(a, b)
x = scatter1d(x, a + b - max_x, range_i)
# x[range_i + d] = max(a, b): Second bitonic sequence gets max(a, b)
x = scatter1d(x, max_x, range_i + d)

# Max index selection
sign = diff < 0
max_idx = a_i + (b_i - a_i) * sign

# idx[range_i] = a_i + b_i - max_idx
# Update indexes array according to the max items
# idx[range_i] = a_i + b_i - max_idx <=> min_idx
idx = scatter1d(idx, a_i + b_i - max_idx, range_i)
idx = scatter1d(idx, max_idx, range_i + d) # idx[range_i + d] = max_idx
# idx[range_i + d] = max_idx
idx = scatter1d(idx, max_idx, range_i + d)

# Update the comparison count for these positions
comparisons[range_i + d] = comparisons[range_i + d] + 1

d = q - p
r = p

# Return only the top-k indexes
topk_indexes = []
for i in range(self.n_neighbors):
topk_indexes.append(idx[i])

topk_indexes = fhe_array(topk_indexes)

assert topk_indexes.shape[0] == self.n_neighbors

return topk_indexes
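# For intuition: in the clear, topk_sorting is equivalent to
#     numpy.argsort(x)[: self.n_neighbors]
# The bitonic network above is needed because encrypted values do not
# support data-dependent branching (illustrative note).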

# 1. Pairwise Euclidean distance
# from concrete import fhe
# with fhe.tag(f"distance_matrix"):
distance_matrix = pairwise_euclidean_distance(q_X)

# The square root in the Euclidean distance calculation is not applied.
# The square root in the Euclidean distance calculation is not applied to speed up FHE
# computations.
# Being a monotonic function, it does not affect the logic of the calculation, notably for
# for the argsort
# the argsort.
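# In the clear, skipping the square root amounts to ranking by squared
# distances: numpy.argsort(d2) equals numpy.argsort(numpy.sqrt(d2)) for
# d2 >= 0, since sqrt is monotonic (illustrative equivalence).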

# 2. Sorting args
# with fhe.tag(f"sorted_args"):
@@ -2031,6 +2018,25 @@ def mul_tlu(a, b):

return numpy.expand_dims(sorted_args, axis=0)

# KNN works only with the MONO parameter selection strategy in the latest Concrete Python version
# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3978
def compile(self, *args, **kwargs) -> Circuit:
# If a configuration instance is given as a positional parameter, force its parameter
# selection strategy to mono-parameter
if len(args) >= 2:
configuration = force_mono_parameter_in_configuration(args[1])
args_list = list(args)
args_list[1] = configuration
args = tuple(args_list)

# Else, retrieve the configuration from kwargs if it exists, or create a new one, and
# force its parameter selection strategy to mono-parameter
else:
configuration = kwargs.get("configuration", None)
kwargs["configuration"] = force_mono_parameter_in_configuration(configuration)

return BaseEstimator.compile(self, *args, **kwargs)
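# Usage sketch (illustrative; `knn` and `X_calib` are hypothetical names):
#     from concrete.fhe.compilation import Configuration
#     config = Configuration(enable_unsafe_features=True)
#     knn.compile(X_calib, configuration=config)
# Whether the configuration is passed positionally or as a keyword, its
# parameter selection strategy ends up forced to MONO.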

def predict(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy.ndarray:

X = check_array_and_assert(X)
@@ -2040,7 +2046,7 @@ def predict(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy.
# Argsort
arg_sort = super().predict(query[None], fhe)
# Majority vote
# pylint: disable=protected-access
# pylint: disable-next=protected-access
label_indices = self._y[arg_sort.flatten()]
y_pred = self.majority_vote(label_indices)
y_preds.append(y_pred)
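End to end, the mono-parameter forcing is transparent to callers. A hedged usage sketch (class name and import path assumed from Concrete ML's sklearn API):

    import numpy
    from concrete.ml.sklearn import KNeighborsClassifier

    X = numpy.random.rand(30, 4)
    y = numpy.random.randint(0, 2, size=30)

    model = KNeighborsClassifier(n_bits=3, n_neighbors=3)
    model.fit(X, y)
    model.compile(X)  # parameter selection strategy is forced to MONO internally
    y_pred = model.predict(X[:2], fhe="simulate")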
