
feat: add logs + evaluation llama lora #962

Merged: 5 commits, Feb 20, 2025
458 changes: 393 additions & 65 deletions src/concrete/ml/quantization/linear_op_glwe_backend.py

Large diffs are not rendered by default.
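The rendered page does not show this file's diff, but the flag it introduces, use_dynamic_quantization, is consumed below in hybrid_model.py. As a rough illustration only (the scale computation, rounding, and bit-widths here are assumptions, not the GLWE executor's actual implementation), dynamic quantization derives the activation scale from the live inputs at each call instead of from calibration data:

# Illustrative sketch, not the backend implementation.
import numpy

def dynamic_quantize(x: numpy.ndarray, n_bits: int = 8):
    # Compute the activation scale from the current values ("dynamic"),
    # then round and clip to a signed n_bits integer range.
    q_max = 2 ** (n_bits - 1) - 1
    scale = max(float(numpy.abs(x).max()) / q_max, 1e-12)
    q_x = numpy.clip(numpy.round(x / scale), -q_max - 1, q_max).astype(numpy.int16)
    return q_x, scale

def quantized_linear(x: numpy.ndarray, q_weight: numpy.ndarray, weight_scale: float):
    # Weights stay statically quantized; only activations are quantized per call.
    q_x, x_scale = dynamic_quantize(x)
    acc = q_x.astype(numpy.int64) @ q_weight.astype(numpy.int64).T
    return acc * (x_scale * weight_scale)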

32 changes: 30 additions & 2 deletions src/concrete/ml/torch/hybrid_model.py
@@ -223,6 +223,20 @@ def _apply(self, fn, recurse=True):
"""
return self

def _ensure_module_on_device(self, x: torch.Tensor) -> None:
"""Ensure the private module is on the same device as the input tensor.

Args:
x (torch.Tensor): The input tensor to match device with.
"""
assert self.private_module is not None

# Check if any parameter is not on the same device as the input tensor
if any(
param.device != x.device for param in self.private_module.parameters()
): # pragma: no cover
self.private_module = self.private_module.to(x.device) # pragma: no cover

def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
"""Forward pass of the remote module.

@@ -272,6 +286,7 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
assert self.private_module is not None
assert self.calibration_data is not None
self.calibration_data.append(x.detach())
self._ensure_module_on_device(x)
y = self.private_module(x)
assert isinstance(y, (QuantTensor, torch.Tensor))

@@ -284,6 +299,8 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
elif self.fhe_local_mode == HybridFHEMode.TORCH:
# Using torch layers
assert self.private_module is not None
# Move private module parameters to same device as input if needed
self._ensure_module_on_device(x)
y = self.private_module(x)
else: # pragma:no cover
# Shouldn't happen
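Both forward branches above now call the same helper before running the private module. A minimal standalone sketch of that device-sync pattern, using a toy nn.Linear in place of the hybrid model's private module:

# Sketch of the device-sync pattern used by _ensure_module_on_device.
import torch

module = torch.nn.Linear(4, 2)
x = torch.randn(3, 4)

# Move the module only when at least one parameter lives on a different
# device than the input tensor.
if any(param.device != x.device for param in module.parameters()):
    module = module.to(x.device)

y = module(x)  # module and input are now guaranteed to share a device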
@@ -555,6 +572,7 @@ def compile_model(
p_error: Optional[float] = None,
device: str = "cpu",
configuration: Optional[Configuration] = None,
use_dynamic_quantization: bool = False,
):
"""Compiles the specific layers to FHE.

@@ -569,7 +587,13 @@ def compile_model(
device: FHE compilation device, can be either 'cpu' or 'cuda'.
configuration (Configuration): A concrete Configuration object specifying the FHE
encryption parameters. If not specified, a default configuration is used.
use_dynamic_quantization (bool): If True, use dynamic quantization;
otherwise, use static quantization. (only for GLWE backend)
"""
assert (
has_glwe_backend() or not use_dynamic_quantization
), "Dynamic quantization requires GLWE backend"

# We do a forward pass where we accumulate inputs to use for compilation
self.set_fhe_mode(HybridFHEMode.CALIBRATE)

@@ -600,7 +624,9 @@ def compile_model(
# then simply quantize the model without compiling with
# Concrete Python.
if self._has_only_large_linear_layers and has_glwe_backend():
self.executor = GLWELinearLayerExecutor()
self.executor = GLWELinearLayerExecutor(
use_dynamic_quantization=use_dynamic_quantization
)
self.private_q_modules[name] = build_quantized_module(
self.private_modules[name],
calibration_data_tensor,
@@ -612,7 +638,9 @@ def compile_model(
vals = self.private_q_modules[name].quant_layers_dict.values()
_, q_op = next(iter(vals))
const_inp = q_op.constant_inputs[1] # Get the weights, the bias is in [2]
const_inp.values = const_inp.qvalues.astype(numpy.float32)

if not use_dynamic_quantization:
const_inp.values = const_inp.qvalues.astype(numpy.float32)
const_inp.qvalues = const_inp.qvalues.astype(numpy.int16)
else:
self.private_q_modules[name] = compile_torch_model(
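For reference, a hypothetical end-to-end usage sketch of the new flag. Everything except use_dynamic_quantization (the toy model, the module name, and the exact HybridFHEModel / compile_model argument order) is assumed rather than taken from this diff:

# Hypothetical usage sketch; constructor and argument names are assumptions.
import torch
from concrete.ml.torch.hybrid_model import HybridFHEModel

# Toy model; "0" names the Sequential submodule that runs remotely.
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU())
hybrid_model = HybridFHEModel(model, module_names=["0"])

x = torch.randn(8, 64)
hybrid_model.compile_model(
    x,
    use_dynamic_quantization=True,  # new flag from this PR; requires the GLWE backend
)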