fix: pcc, tests, glwe extensions only on linux
andrei-stoian-zama committed Oct 10, 2024
1 parent 9a16901 commit 7f64c9f
Showing 10 changed files with 260 additions and 171 deletions.
2 changes: 1 addition & 1 deletion docs/deep-learning/fhe_assistant.md
@@ -77,7 +77,7 @@ concrete_clf.compile(X, debug_config)

#### 3. Quantization import failed

**Error message**: `Error occurred during quantization aware training (QAT) import [...] Could not determine a unique scale for the quantization!`.
**Error message**: `Error occurred during quantization aware training (QAT) import [...] Are you missing a QuantIdentity layer in your Brevitas model?`.

**Cause**: This error occurs when a model imported as a quantization-aware training (QAT) model lacks quantization operators. See [this guide](../deep-learning/fhe_friendly_models.md) on how to use Brevitas layers. The message indicates that some layers do not receive inputs quantized through `QuantIdentity` layers.

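For context only (not part of this diff), here is a minimal sketch of a Brevitas model in which every linear layer receives inputs quantized through a `QuantIdentity` layer; the layer sizes and bit-widths are illustrative assumptions:

```python
import torch
import torch.nn as nn
import brevitas.nn as qnn

class TinyQATModel(nn.Module):
    """Toy QAT model: each linear layer's input passes through a QuantIdentity."""

    def __init__(self, n_bits: int = 3):
        super().__init__()
        # Quantize the network inputs before the first linear layer
        self.quant_in = qnn.QuantIdentity(bit_width=n_bits, return_quant_tensor=True)
        self.fc1 = qnn.QuantLinear(10, 32, bias=True, weight_bit_width=n_bits)
        # Re-quantize activations before the second linear layer
        self.quant_mid = qnn.QuantIdentity(bit_width=n_bits, return_quant_tensor=True)
        self.fc2 = qnn.QuantLinear(32, 2, bias=True, weight_bit_width=n_bits)

    def forward(self, x):
        x = self.quant_in(x)
        x = torch.relu(self.fc1(x))
        x = self.quant_mid(x)
        return self.fc2(x)
```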
204 changes: 102 additions & 102 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
@@ -39,7 +39,9 @@ python = ">=3.8.1,<3.12"
# https://python-poetry.org/docs/1.7/repositories#project-configuration
# concrete-python = {version="==2.7.0", source = "zama-pypi-cpu"}
concrete-python = {version="==2.8.1", source = "zama-pypi-cpu"}
concrete-ml-extensions = "0.1.2"
concrete-ml-extensions = [
{version = "0.1.2", platform = "linux" }
]
setuptools = "65.6.3"
skops = {version = "0.5.0"}
xgboost = "1.6.2"
13 changes: 12 additions & 1 deletion src/concrete/ml/quantization/base_quantized_op.py
@@ -648,7 +648,7 @@ def _prepare_inputs_with_constants(
# QuantizedArrays, else we return the float32 values directly.

curr_input_fill_idx = 0
for input_ in inputs:
for input_idx, input_ in enumerate(inputs):
while prepared_inputs[curr_input_fill_idx] is not None:
curr_input_fill_idx += 1

@@ -669,6 +669,17 @@
# This is used by mixing (conv/gemm) or value re-arranging ops (reshape)
input_ = cast(QuantizedArray, input_)
new_input = self._prepare_quantized_input(input_)

# Check that the input quantizer is correct - that it can de-quantize
# values correctly. If it cannot, the input's index is added to the list of
# invalid tensors for which an error is raised
if (
new_input.quantizer.is_qat
and not input_.quantizer.is_precomputed_qat
and self.error_tracker is not None
):
self.error_tracker.append(input_idx)

prepared_inputs[curr_input_fill_idx] = new_input
else:
# This is usually used for univariate ops that are fused into TLUs
4 changes: 4 additions & 0 deletions src/concrete/ml/quantization/post_training.py
@@ -515,9 +515,13 @@ def _quantize_layers(self, *input_calibration_data: numpy.ndarray):

constants: Set[str] = set(self.quant_params.keys())

# Check if the model has only GLWE-supported linear layers.
# In this case, use analytical calibration, which is much faster
fast_calibration = True
for node in graph.node:
op_type = get_op_type(node)
if op_type == "Constant":
continue
quantized_op_class = ONNX_OPS_TO_QUANTIZED_IMPL[op_type]
if quantized_op_class not in OPS_WITH_GLWE_BACKEND_SUPPORT:
fast_calibration = False
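For illustration only (not part of this diff), the kind of check performed above can be sketched as follows; `SUPPORTED_OP_TYPES` is a stand-in for `OPS_WITH_GLWE_BACKEND_SUPPORT`, which in the real code maps ONNX ops to quantized implementations:

```python
import onnx

# Assumption: only these linear ONNX ops have GLWE backend support
SUPPORTED_OP_TYPES = {"Gemm", "MatMul"}

def all_ops_glwe_supported(model: onnx.ModelProto) -> bool:
    """Return True when every computational node can use the GLWE backend."""
    for node in model.graph.node:
        if node.op_type == "Constant":
            # Constant nodes only carry weights/initializer data, skip them
            continue
        if node.op_type not in SUPPORTED_OP_TYPES:
            return False
    return True
```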
2 changes: 1 addition & 1 deletion src/concrete/ml/quantization/quantized_module.py
@@ -55,7 +55,7 @@ def _raise_qat_import_error(bad_qat_ops: List[Tuple[str, str]]):
bad_qat_ops,
)
)
+ "\n\nCould not determine a unique scale for the quantization! "
+ "\n\nAre you missing a QuantIdentity layer in your Brevitas model? "
"Please check the ONNX graph of this model."
)

17 changes: 12 additions & 5 deletions src/concrete/ml/quantization/quantizers.py
@@ -62,10 +62,17 @@ def fill_from_kwargs(obj, klass, accept_missing, **kwargs):

# If the structure was created or modified by a call to this function, check
# that it is completely filled
if obj is not None and not accept_missing:
for name in hints:
if getattr(obj, name) is None:
raise TypeError(f"Missing quantizer parameter {name}")
if obj is not None:
all_members_missing = all(getattr(obj, name) is None for name in hints)

if not accept_missing or (accept_missing and not all_members_missing):
missing_params_str = ",".join([name for name in hints if getattr(obj, name) is None])
given_params_str = ",".join([name for name in hints if getattr(obj, name) is not None])
if len(missing_params_str) > 0:
raise TypeError(
f"Missing quantizer parameter {missing_params_str}, "
f"but {given_params_str} were given"
)

# Return the parameter structure and the kwargs with the used parameters removed
return obj, kwargs
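In other words, a parameter structure may now be either completely empty (when missing values are accepted) or completely filled; a partially filled one triggers an error listing both the missing and the given fields. A standalone sketch of that all-or-nothing check (not part of this diff), with a hypothetical `Stats` class:

```python
from typing import Optional

class Stats:
    """Hypothetical parameter structure with optional fields."""

    def __init__(self, rmin: Optional[float] = None, rmax: Optional[float] = None):
        self.rmin = rmin
        self.rmax = rmax

def validate(obj: Stats, accept_missing: bool) -> None:
    hints = ("rmin", "rmax")
    all_members_missing = all(getattr(obj, name) is None for name in hints)

    # An empty structure is fine when missing values are accepted,
    # but a partially filled one is always an error
    if not accept_missing or not all_members_missing:
        missing = [name for name in hints if getattr(obj, name) is None]
        given = [name for name in hints if getattr(obj, name) is not None]
        if missing:
            raise TypeError(
                f"Missing quantizer parameter {','.join(missing)}, "
                f"but {','.join(given)} were given"
            )

validate(Stats(), accept_missing=True)  # OK: all fields missing
try:
    validate(Stats(rmax=2.0), accept_missing=True)
except TypeError as exc:
    print(exc)  # Missing quantizer parameter rmin, but rmax were given
```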
@@ -688,7 +695,7 @@ def quant(self, values: numpy.ndarray) -> numpy.ndarray:
# (where quantizer parameters are available in ONNX layers).
# It is possible to disable this clipping step for specific cases such as quantizing values
# within fully-leveled circuits (where no bounds are needed)
if self.is_qat and not self.no_clipping:
if not self.no_clipping:
# Offset is either 2^(n-1) or 0, but for narrow range
# the values should be clipped to [-2^(n-1)+1, ..., 2^(n-1)-1], so we add
# one to the minimum value for narrow range
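For reference (not part of this diff), with a signed quantizer of `n` bits and offset `2^(n-1)`, values are clipped to `[-2^(n-1), 2^(n-1) - 1]`, and the lower bound is raised by one for narrow-range quantizers; a small numpy sketch under those assumptions:

```python
import numpy

def clip_quantized(qvalues: numpy.ndarray, n_bits: int, is_narrow: bool) -> numpy.ndarray:
    """Clip quantized integers to the representable signed range."""
    offset = 2 ** (n_bits - 1)
    lower = -offset + (1 if is_narrow else 0)  # narrow range drops -2^(n-1)
    upper = offset - 1
    return numpy.clip(qvalues, lower, upper)

values = numpy.array([-10, -8, -7, 0, 7, 9])
print(clip_quantized(values, n_bits=4, is_narrow=False))  # [-8 -8 -7  0  7  7]
print(clip_quantized(values, n_bits=4, is_narrow=True))   # [-7 -7 -7  0  7  7]
```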
119 changes: 82 additions & 37 deletions src/concrete/ml/torch/hybrid_model.py
@@ -2,6 +2,7 @@

# pylint: disable=too-many-lines
import ast
import contextvars
import enum
import io
import json
@@ -15,15 +16,22 @@
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import concrete_ml_extensions as fhext
try:
import concrete_ml_extensions as fhext

_HAS_GLWE_BACKEND = True
except ImportError: # pragma: no cover
fhext = None
_HAS_GLWE_BACKEND = False

import numpy
import requests
import torch
from brevitas.quant_tensor import QuantTensor
from concrete.fhe import Configuration
from torch import nn

from ..common.utils import MAX_BITWIDTH_BACKWARD_COMPATIBLE, assert_true, to_tuple
from ..common.utils import MAX_BITWIDTH_BACKWARD_COMPATIBLE, to_tuple
from ..deployment.fhe_client_server import FHEModelClient, FHEModelDev, FHEModelServer
from ..quantization.post_training import OPS_WITH_GLWE_BACKEND_SUPPORT
from .compile import (
@@ -245,6 +253,13 @@ def forward(
return y


# This module member is instantiated by the Hybrid FHE model
# when hybrid FHE forward is called and the GLWE backend is available
_optimized_linear_executor: contextvars.ContextVar[Optional[OptimizedLinearLayerExecutor]] = (
contextvars.ContextVar("optimized_linear_executor")
)


# pylint: disable-next=too-many-instance-attributes
class RemoteModule(nn.Module):
"""A wrapper class for the modules to be evaluated remotely with FHE."""
@@ -256,7 +271,7 @@ def __init__(
module_name: Optional[str] = None,
model_name: Optional[str] = None,
verbose: int = 0,
optimized_linear_layer_executor: OptimizedLinearLayerExecutor = None,
optimized_linear_execution: bool = False,
):
super().__init__()
self.private_module: Optional[nn.Module] = module
@@ -271,7 +286,7 @@ def __init__(
self.module_name: Optional[str] = module_name
self.model_name: Optional[str] = model_name
self.verbose = verbose
self.optimized_linear_layer_executor = optimized_linear_layer_executor
self.optimized_linear_execution = optimized_linear_execution

def init_fhe_client(
self, path_to_client: Optional[Path] = None, path_to_keys: Optional[Path] = None
@@ -387,16 +402,16 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
None,
}:
assert self.private_q_module is not None
if self.optimized_linear_layer_executor:
assert_true(
self.fhe_local_mode != HybridFHEMode.SIMULATE,
"When the HybridFHEModel is instantiated with only "
"linear remote layers, fhe=simulate is not supported for now.",
)

try:
optimized_linear_layer_executor = _optimized_linear_executor.get()
except LookupError:
optimized_linear_layer_executor = None

if optimized_linear_layer_executor:
# Delegate to the optimized GLWE executor
y = torch.Tensor(
self.optimized_linear_layer_executor.forward(
optimized_linear_layer_executor.forward(
x.detach().numpy(), self.private_q_module, self.fhe_local_mode
)
)
@@ -414,7 +429,12 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:

elif self.fhe_local_mode == HybridFHEMode.REMOTE: # pragma:no cover
# Remote call
assert self.optimized_linear_layer_executor is None, (
try:
optimized_linear_layer_executor = _optimized_linear_executor.get()
except LookupError:
optimized_linear_layer_executor = None

assert optimized_linear_layer_executor is None, (
"Remote optimized linear layers " "are not yet implemented"
)
y = self.remote_call(x)
@@ -532,8 +552,10 @@ def __init__(
self.model_name = model_name
self.verbose = verbose

self.default_crypto_params_glwe = json.loads(
fhext.default_params() # pylint: disable=no-member
self.default_crypto_params_glwe = (
json.loads(fhext.default_params()) # pylint: disable=no-member
if _HAS_GLWE_BACKEND
else None
)

self._replace_modules()
@@ -559,35 +581,15 @@ def _replace_modules(self):
if not is_pure_linear_layer:
self._all_layers_are_pure_linear = False

# If all layers are pure linear, enable the GLWE optimization for all layers
# and generate an encryption and compression key for all layers
# as they share crypto-parameters
private_key, compression_key = None, None
if self._all_layers_are_pure_linear:
# pylint: disable-next=no-member
fhext_glwe_crypto_params = fhext.MatmulCryptoParameters.deserialize(
json.dumps(self.default_crypto_params_glwe)
)
# pylint: disable-next=no-member
private_key, compression_key = fhext.create_private_key(fhext_glwe_crypto_params)

for module_name in self.module_names:
# Create the optimized glwe linear layer executor if needed
optimized_linear_executor = None
if self._all_layers_are_pure_linear:
optimized_linear_executor = OptimizedLinearLayerExecutor(
self.default_crypto_params_glwe,
private_key=private_key,
compression_key=compression_key,
)

remote_module = RemoteModule(
module=self.private_modules[module_name],
server_remote_address=self.server_remote_address,
module_name=module_name,
model_name=self.model_name,
verbose=self.verbose,
optimized_linear_layer_executor=optimized_linear_executor,
optimized_linear_execution=self._all_layers_are_pure_linear,
)

self.remote_modules[module_name] = remote_module
@@ -608,10 +610,50 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
Returns:
torch.Tensor: The output tensor.
Raises:
AssertionError: if the execution mode is not supported
"""
self.set_fhe_mode(fhe)

return self.model(x)
# Validate the FHE mode
fhe_mode = HybridFHEMode(fhe)

if _HAS_GLWE_BACKEND and self._all_layers_are_pure_linear:
if fhe_mode == HybridFHEMode.SIMULATE:
raise AssertionError(
"When the HybridFHEModel is instantiated with only "
"linear remote layers, fhe=simulate is not supported for now.",
)

if fhe_mode in (HybridFHEMode.EXECUTE, HybridFHEMode.REMOTE):
# If all layers are pure linear, enable the GLWE optimization for all layers
# and generate an encryption and compression key for all layers
# as they share crypto-parameters
private_key, compression_key = None, None
if self._all_layers_are_pure_linear:
# pylint: disable-next=no-member
fhext_glwe_crypto_params = fhext.MatmulCryptoParameters.deserialize(
json.dumps(self.default_crypto_params_glwe)
)
# pylint: disable-next=no-member
private_key, compression_key = fhext.create_private_key(
fhext_glwe_crypto_params
)

_optimized_linear_executor.set(
OptimizedLinearLayerExecutor(
self.default_crypto_params_glwe,
private_key=private_key,
compression_key=compression_key,
)
)

result = self.model(x)

_optimized_linear_executor.set(None)

return result

def __call__(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
"""Call method to run the model locally with a fhe mode.
@@ -708,7 +750,10 @@ def compile_model(
device=device,
)
else:
if self._all_layers_are_pure_linear:
# If all layers are linear and the GLWE backend is available
# then simply quantize the model without compiling with
# Concrete Python.
if self._all_layers_are_pure_linear and _HAS_GLWE_BACKEND:
self.private_q_modules[name] = build_quantized_module(
self.private_modules[name],
calibration_data_tensor,
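For context (not part of this diff), a rough usage sketch of the hybrid path touched here; the toy model, module names, calibration data and `n_bits` are illustrative assumptions, and the exact API may differ:

```python
import torch
from torch import nn

from concrete.ml.torch.hybrid_model import HybridFHEModel

# Toy model whose remote modules are all pure linear layers, so the
# GLWE-optimized execution applies when concrete-ml-extensions is installed
model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 2))
hybrid_model = HybridFHEModel(model, module_names=["0", "2"])

calibration_data = torch.randn(100, 10)
hybrid_model.compile_model(calibration_data, n_bits=8)

x = torch.randn(1, 10)
y_clear = hybrid_model(x, fhe="disable")  # plain torch execution
y_fhe = hybrid_model(x, fhe="execute")    # linear layers go through the GLWE backend

# With only linear remote layers and the GLWE backend available,
# fhe="simulate" raises an AssertionError (see forward() above)
```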
5 changes: 4 additions & 1 deletion tests/quantization/test_quantizers.py
@@ -115,6 +115,9 @@ def test_quantized_array_constructor():
value_shape = (10,)
values = numpy.random.uniform(0, 1, size=value_shape)

# Missing rmin/rmax, should be recomputed
QuantizedArray(2, values, stats=None)

# Create an array with precomputed statistics
qarr = QuantizedArray(2, values, stats=None, rmax=2, rmin=-1)

@@ -127,7 +130,7 @@
QuantizedArray(2, values, stats=None, __InvalidParam=2)

# Test an incomplete stats structure, should throw an error
with pytest.raises(TypeError):
with pytest.raises(TypeError, match="Missing quantizer parameter rmin, but rmax were given"):
QuantizedArray(2, values, stats=None, rmax=2)

