fix: pcc, tests, glwe extensions only on linux
andrei-stoian-zama committed Oct 10, 2024
1 parent 9a16901 commit 7f64c9f
Showing 10 changed files with 260 additions and 171 deletions.
2 changes: 1 addition & 1 deletion docs/deep-learning/fhe_assistant.md
@@ -77,7 +77,7 @@ concrete_clf.compile(X, debug_config)

#### 3. Quantization import failed

**Error message**: `Error occurred during quantization aware training (QAT) import [...] Could not determine a unique scale for the quantization!`.
**Error message**: `Error occurred during quantization aware training (QAT) import [...] Are you missing a QuantIdentity layer in your Brevitas model?`.

**Cause**: This error occurs when a model imported as a quantization-aware training (QAT) model lacks quantization operators. See [this guide](../deep-learning/fhe_friendly_models.md) on how to use Brevitas layers. The message indicates that some layers do not receive inputs quantized through `QuantIdentity` layers.

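For context only (not part of this diff), here is a minimal sketch of a Brevitas model in which every linear layer receives inputs quantized through a `QuantIdentity` layer; the layer sizes and bit-widths are illustrative assumptions:

```python
import torch
import torch.nn as nn
import brevitas.nn as qnn

class TinyQATModel(nn.Module):
    """Toy QAT model: each linear layer's input passes through a QuantIdentity."""

    def __init__(self, n_bits: int = 3):
        super().__init__()
        # Quantize the network inputs before the first linear layer
        self.quant_in = qnn.QuantIdentity(bit_width=n_bits, return_quant_tensor=True)
        self.fc1 = qnn.QuantLinear(10, 32, bias=True, weight_bit_width=n_bits)
        # Re-quantize activations before the second linear layer
        self.quant_mid = qnn.QuantIdentity(bit_width=n_bits, return_quant_tensor=True)
        self.fc2 = qnn.QuantLinear(32, 2, bias=True, weight_bit_width=n_bits)

    def forward(self, x):
        x = self.quant_in(x)
        x = torch.relu(self.fc1(x))
        x = self.quant_mid(x)
        return self.fc2(x)
```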
204 changes: 102 additions & 102 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
@@ -39,7 +39,9 @@ python = ">=3.8.1,<3.12"
# https://python-poetry.org/docs/1.7/repositories#project-configuration
# concrete-python = {version="==2.7.0", source = "zama-pypi-cpu"}
concrete-python = {version="==2.8.1", source = "zama-pypi-cpu"}
concrete-ml-extensions = "0.1.2"
concrete-ml-extensions = [
{version = "0.1.2", platform = "linux" }
]
setuptools = "65.6.3"
skops = {version = "0.5.0"}
xgboost = "1.6.2"
13 changes: 12 additions & 1 deletion src/concrete/ml/quantization/base_quantized_op.py
@@ -648,7 +648,7 @@ def _prepare_inputs_with_constants(
# QuantizedArrays, else we return the float32 values directly.

curr_input_fill_idx = 0
for input_ in inputs:
for input_idx, input_ in enumerate(inputs):
while prepared_inputs[curr_input_fill_idx] is not None:
curr_input_fill_idx += 1

@@ -669,6 +669,17 @@
# This is used by mixing (conv/gemm) or value re-arranging ops (reshape)
input_ = cast(QuantizedArray, input_)
new_input = self._prepare_quantized_input(input_)

# Check that the input quantizer is correct - that it can de-quantize
# values correctly. If it cannot, the input's index is added to the list of
# invalid tensors for which an error is raised
if (
new_input.quantizer.is_qat
and not input_.quantizer.is_precomputed_qat
and self.error_tracker is not None
):
self.error_tracker.append(input_idx)

prepared_inputs[curr_input_fill_idx] = new_input
else:
# This is usually used for univariate ops that are fused into TLUs
4 changes: 4 additions & 0 deletions src/concrete/ml/quantization/post_training.py
@@ -515,9 +515,13 @@ def _quantize_layers(self, *input_calibration_data: numpy.ndarray):

constants: Set[str] = set(self.quant_params.keys())

# Check if the model has only GLWE-supported linear layers.
# In this case, use analytical calibration, which is much faster
fast_calibration = True
for node in graph.node:
op_type = get_op_type(node)
if op_type == "Constant":
continue
quantized_op_class = ONNX_OPS_TO_QUANTIZED_IMPL[op_type]
if quantized_op_class not in OPS_WITH_GLWE_BACKEND_SUPPORT:
fast_calibration = False
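For illustration only (not part of this diff), the kind of check performed above can be sketched as follows; `SUPPORTED_OP_TYPES` is a stand-in for `OPS_WITH_GLWE_BACKEND_SUPPORT`, which in the real code maps ONNX ops to quantized implementations:

```python
import onnx

# Assumption: only these linear ONNX ops have GLWE backend support
SUPPORTED_OP_TYPES = {"Gemm", "MatMul"}

def all_ops_glwe_supported(model: onnx.ModelProto) -> bool:
    """Return True when every computational node can use the GLWE backend."""
    for node in model.graph.node:
        if node.op_type == "Constant":
            # Constant nodes only carry weights/initializer data, skip them
            continue
        if node.op_type not in SUPPORTED_OP_TYPES:
            return False
    return True
```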
2 changes: 1 addition & 1 deletion src/concrete/ml/quantization/quantized_module.py
@@ -55,7 +55,7 @@ def _raise_qat_import_error(bad_qat_ops: List[Tuple[str, str]]):
bad_qat_ops,
)
)
+ "\n\nCould not determine a unique scale for the quantization! "
+ "\n\nAre you missing a QuantIdentity layer in your Brevitas model? "
"Please check the ONNX graph of this model."
)

17 changes: 12 additions & 5 deletions src/concrete/ml/quantization/quantizers.py
@@ -62,10 +62,17 @@ def fill_from_kwargs(obj, klass, accept_missing, **kwargs):

# If the structure was created or modified by a call to this function, check
# that it is completely filled
if obj is not None and not accept_missing:
for name in hints:
if getattr(obj, name) is None:
raise TypeError(f"Missing quantizer parameter {name}")
if obj is not None:
all_members_missing = all(getattr(obj, name) is None for name in hints)

if not accept_missing or (accept_missing and not all_members_missing):
missing_params_str = ",".join([name for name in hints if getattr(obj, name) is None])
given_params_str = ",".join([name for name in hints if getattr(obj, name) is not None])
if len(missing_params_str) > 0:
raise TypeError(
f"Missing quantizer parameter {missing_params_str}, "
f"but {given_params_str} were given"
)

# Return the parameter structure and the kwargs with the used parameters removed
return obj, kwargs
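In other words, a parameter structure may now be either completely empty (when missing values are accepted) or completely filled; a partially filled one triggers an error listing both the missing and the given fields. A standalone sketch of that all-or-nothing check (not part of this diff), with a hypothetical `Stats` class:

```python
from typing import Optional

class Stats:
    """Hypothetical parameter structure with optional fields."""

    def __init__(self, rmin: Optional[float] = None, rmax: Optional[float] = None):
        self.rmin = rmin
        self.rmax = rmax

def validate(obj: Stats, accept_missing: bool) -> None:
    hints = ("rmin", "rmax")
    all_members_missing = all(getattr(obj, name) is None for name in hints)

    # An empty structure is fine when missing values are accepted,
    # but a partially filled one is always an error
    if not accept_missing or not all_members_missing:
        missing = [name for name in hints if getattr(obj, name) is None]
        given = [name for name in hints if getattr(obj, name) is not None]
        if missing:
            raise TypeError(
                f"Missing quantizer parameter {','.join(missing)}, "
                f"but {','.join(given)} were given"
            )

validate(Stats(), accept_missing=True)  # OK: all fields missing
try:
    validate(Stats(rmax=2.0), accept_missing=True)
except TypeError as exc:
    print(exc)  # Missing quantizer parameter rmin, but rmax were given
```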
@@ -688,7 +695,7 @@ def quant(self, values: numpy.ndarray) -> numpy.ndarray:
# (where quantizer parameters are available in ONNX layers).
# It is possible to disable this clipping step for specific cases such as quantizing values
# within fully-leveled circuits (where no bounds are needed)
if self.is_qat and not self.no_clipping:
if not self.no_clipping:
# Offset is either 2^(n-1) or 0, but for narrow range
# the values should be clipped to [-2^(n-1)+1, ..., 2^(n-1)-1], so we add
# one to the minimum value for narrow range
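For reference (not part of this diff), with a signed quantizer of `n` bits and offset `2^(n-1)`, values are clipped to `[-2^(n-1), 2^(n-1) - 1]`, and the lower bound is raised by one for narrow-range quantizers; a small numpy sketch under those assumptions:

```python
import numpy

def clip_quantized(qvalues: numpy.ndarray, n_bits: int, is_narrow: bool) -> numpy.ndarray:
    """Clip quantized integers to the representable signed range."""
    offset = 2 ** (n_bits - 1)
    lower = -offset + (1 if is_narrow else 0)  # narrow range drops -2^(n-1)
    upper = offset - 1
    return numpy.clip(qvalues, lower, upper)

values = numpy.array([-10, -8, -7, 0, 7, 9])
print(clip_quantized(values, n_bits=4, is_narrow=False))  # [-8 -8 -7  0  7  7]
print(clip_quantized(values, n_bits=4, is_narrow=True))   # [-7 -7 -7  0  7  7]
```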
119 changes: 82 additions & 37 deletions src/concrete/ml/torch/hybrid_model.py
@@ -2,6 +2,7 @@

# pylint: disable=too-many-lines
import ast
import contextvars
import enum
import io
import json
@@ -15,15 +16,22 @@
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import concrete_ml_extensions as fhext
try:
import concrete_ml_extensions as fhext

_HAS_GLWE_BACKEND = True
except ImportError: # pragma: no cover
fhext = None
_HAS_GLWE_BACKEND = False

import numpy
import requests
import torch
from brevitas.quant_tensor import QuantTensor
from concrete.fhe import Configuration
from torch import nn

from ..common.utils import MAX_BITWIDTH_BACKWARD_COMPATIBLE, assert_true, to_tuple
from ..common.utils import MAX_BITWIDTH_BACKWARD_COMPATIBLE, to_tuple
from ..deployment.fhe_client_server import FHEModelClient, FHEModelDev, FHEModelServer
from ..quantization.post_training import OPS_WITH_GLWE_BACKEND_SUPPORT
from .compile import (
@@ -245,6 +253,13 @@ def forward(
return y


# This module member is instantiated by the Hybrid FHE model
# when hybrid FHE forward is called and the GLWE backend is available
_optimized_linear_executor: contextvars.ContextVar[Optional[OptimizedLinearLayerExecutor]] = (
contextvars.ContextVar("optimized_linear_executor")
)


# pylint: disable-next=too-many-instance-attributes
class RemoteModule(nn.Module):
"""A wrapper class for the modules to be evaluated remotely with FHE."""
@@ -256,7 +271,7 @@ def __init__(
module_name: Optional[str] = None,
model_name: Optional[str] = None,
verbose: int = 0,
optimized_linear_layer_executor: OptimizedLinearLayerExecutor = None,
optimized_linear_execution: bool = False,
):
super().__init__()
self.private_module: Optional[nn.Module] = module
@@ -271,7 +286,7 @@ def __init__(
self.module_name: Optional[str] = module_name
self.model_name: Optional[str] = model_name
self.verbose = verbose
self.optimized_linear_layer_executor = optimized_linear_layer_executor
self.optimized_linear_execution = optimized_linear_execution

def init_fhe_client(
self, path_to_client: Optional[Path] = None, path_to_keys: Optional[Path] = None
@@ -387,16 +402,16 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
None,
}:
assert self.private_q_module is not None
if self.optimized_linear_layer_executor:
assert_true(
self.fhe_local_mode != HybridFHEMode.SIMULATE,
"When the HybridFHEModel is instantiated with only "
"linear remote layers, fhe=simulate is not supported for now.",
)

try:
optimized_linear_layer_executor = _optimized_linear_executor.get()
except LookupError:
optimized_linear_layer_executor = None

if optimized_linear_layer_executor:
# Delegate to the optimized GLWE executor
y = torch.Tensor(
self.optimized_linear_layer_executor.forward(
optimized_linear_layer_executor.forward(
x.detach().numpy(), self.private_q_module, self.fhe_local_mode
)
)
@@ -414,7 +429,12 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:

elif self.fhe_local_mode == HybridFHEMode.REMOTE: # pragma:no cover
# Remote call
assert self.optimized_linear_layer_executor is None, (
try:
optimized_linear_layer_executor = _optimized_linear_executor.get()
except LookupError:
optimized_linear_layer_executor = None

assert optimized_linear_layer_executor is None, (
"Remote optimized linear layers " "are not yet implemented"
)
y = self.remote_call(x)
@@ -532,8 +552,10 @@ def __init__(
self.model_name = model_name
self.verbose = verbose

self.default_crypto_params_glwe = json.loads(
fhext.default_params() # pylint: disable=no-member
self.default_crypto_params_glwe = (
json.loads(fhext.default_params()) # pylint: disable=no-member
if _HAS_GLWE_BACKEND
else None
)

self._replace_modules()
@@ -559,35 +581,15 @@ def _replace_modules(self):
if not is_pure_linear_layer:
self._all_layers_are_pure_linear = False

# If all layers are pure linear, enable the GLWE optimization for all layers
# and generate an encryption and compression key for all layers
# as they share crypto-parameters
private_key, compression_key = None, None
if self._all_layers_are_pure_linear:
# pylint: disable-next=no-member
fhext_glwe_crypto_params = fhext.MatmulCryptoParameters.deserialize(
json.dumps(self.default_crypto_params_glwe)
)
# pylint: disable-next=no-member
private_key, compression_key = fhext.create_private_key(fhext_glwe_crypto_params)

for module_name in self.module_names:
# Create the optimized glwe linear layer executor if needed
optimized_linear_executor = None
if self._all_layers_are_pure_linear:
optimized_linear_executor = OptimizedLinearLayerExecutor(
self.default_crypto_params_glwe,
private_key=private_key,
compression_key=compression_key,
)

remote_module = RemoteModule(
module=self.private_modules[module_name],
server_remote_address=self.server_remote_address,
module_name=module_name,
model_name=self.model_name,
verbose=self.verbose,
optimized_linear_layer_executor=optimized_linear_executor,
optimized_linear_execution=self._all_layers_are_pure_linear,
)

self.remote_modules[module_name] = remote_module
@@ -608,10 +610,50 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
Returns:
torch.Tensor: The output tensor.
Raises:
AssertionError: if the execution mode is not supported
"""
self.set_fhe_mode(fhe)

return self.model(x)
# Validate the FHE mode
fhe_mode = HybridFHEMode(fhe)

if _HAS_GLWE_BACKEND and self._all_layers_are_pure_linear:
if fhe_mode == HybridFHEMode.SIMULATE:
raise AssertionError(
"When the HybridFHEModel is instantiated with only "
"linear remote layers, fhe=simulate is not supported for now.",
)

if fhe_mode in (HybridFHEMode.EXECUTE, HybridFHEMode.REMOTE):
# If all layers are pure linear, enable the GLWE optimization for all layers
# and generate an encryption and compression key for all layers
# as they share crypto-parameters
private_key, compression_key = None, None
if self._all_layers_are_pure_linear:
# pylint: disable-next=no-member
fhext_glwe_crypto_params = fhext.MatmulCryptoParameters.deserialize(
json.dumps(self.default_crypto_params_glwe)
)
# pylint: disable-next=no-member
private_key, compression_key = fhext.create_private_key(
fhext_glwe_crypto_params
)

_optimized_linear_executor.set(
OptimizedLinearLayerExecutor(
self.default_crypto_params_glwe,
private_key=private_key,
compression_key=compression_key,
)
)

result = self.model(x)

_optimized_linear_executor.set(None)

return result

def __call__(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
"""Call method to run the model locally with a fhe mode.
@@ -708,7 +750,10 @@ def compile_model(
device=device,
)
else:
if self._all_layers_are_pure_linear:
# If all layers are linear and the GLWE backend is available
# then simply quantize the model without compiling with
# Concrete Python.
if self._all_layers_are_pure_linear and _HAS_GLWE_BACKEND:
self.private_q_modules[name] = build_quantized_module(
self.private_modules[name],
calibration_data_tensor,
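For context (not part of this diff), a rough usage sketch of the hybrid path touched here; the toy model, module names, calibration data and `n_bits` are illustrative assumptions, and the exact API may differ:

```python
import torch
from torch import nn

from concrete.ml.torch.hybrid_model import HybridFHEModel

# Toy model whose remote modules are all pure linear layers, so the
# GLWE-optimized execution applies when concrete-ml-extensions is installed
model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 2))
hybrid_model = HybridFHEModel(model, module_names=["0", "2"])

calibration_data = torch.randn(100, 10)
hybrid_model.compile_model(calibration_data, n_bits=8)

x = torch.randn(1, 10)
y_clear = hybrid_model(x, fhe="disable")  # plain torch execution
y_fhe = hybrid_model(x, fhe="execute")    # linear layers go through the GLWE backend

# With only linear remote layers and the GLWE backend available,
# fhe="simulate" raises an AssertionError (see forward() above)
```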
5 changes: 4 additions & 1 deletion tests/quantization/test_quantizers.py
@@ -115,6 +115,9 @@ def test_quantized_array_constructor():
value_shape = (10,)
values = numpy.random.uniform(0, 1, size=value_shape)

# Missing rmin/rmax, should be recomputed
QuantizedArray(2, values, stats=None)

# Create an array with precomputed statistics
qarr = QuantizedArray(2, values, stats=None, rmax=2, rmin=-1)

@@ -127,7 +130,7 @@
QuantizedArray(2, values, stats=None, __InvalidParam=2)

# Test an incomplete stats structure, should throw an error
with pytest.raises(TypeError):
with pytest.raises(TypeError, match="Missing quantizer parameter rmin, but rmax were given"):
QuantizedArray(2, values, stats=None, rmax=2)

