chore/improve_gpt2_use_case (#915)
jfrery authored Oct 10, 2024
1 parent cf7be06 commit fefc19d
Showing 9 changed files with 488 additions and 620 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -207,7 +207,7 @@ Concrete ML built-in models have APIs that are almost identical to their scikit-

- [Encrypted Large Language Model](use_case_examples/llm/): converting a user-defined part of a Large Language Model for encrypted text generation. This demo shows the trade-off between quantization and accuracy for text generation and shows how to run the model in FHE.
- [Private inference for federated learned models](use_case_examples/federated_learning/): private training of a Logistic Regression model and then importing the model into Concrete ML and performing encrypted prediction.
- [Titanic](use_case_examples/titanic/KaggleTitanic.ipynb): solving the [Kaggle Titanic competition](https://www.kaggle.com/c/titanic/). Implemented with XGBoost from Concrete ML, this example comes as a companion of the [Kaggle notebook](https://www.kaggle.com/code/concretemlteam/titanic-with-privacy-preserving-machine-learning), and was the subject of a blogpost in [KDnuggets](https://www.kdnuggets.com/2022/08/machine-learning-encrypted-data.html).
- [Titanic](use_case_examples/titanic/KaggleTitanic.ipynb): solving the [Kaggle Titanic competition](https://www.kaggle.com/c/titanic/). Implemented with XGBoost from Concrete ML, this example comes as a companion of the [Kaggle notebook](https://www.kaggle.com/code/concretemlteam/titanic-with-privacy-preserving-machine-learning).
- [CIFAR10 FHE-friendly model with Brevitas](use_case_examples/cifar/cifar_brevitas_training): training a VGG9 FHE-compatible neural network using Brevitas, and a script to run the neural network in FHE. Execution in FHE takes ~4 minutes per image and shows an accuracy of 88.7%.
- [CIFAR10 / CIFAR100 FHE-friendly models with Transfer Learning approach](use_case_examples/cifar/cifar_brevitas_finetuning): series of three notebooks, that convert a pre-trained FP32 VGG11 neural network into a quantized model using Brevitas. The model is fine-tuned on the CIFAR data-sets, converted for FHE execution with Concrete ML and evaluated using FHE simulation. For CIFAR10 and CIFAR100, respectively, our simulations show an accuracy of 90.2% and 68.2%.

11 changes: 5 additions & 6 deletions script/make_utils/nbqa.sh
@@ -28,7 +28,7 @@ function nbqa_ize()
# %matplotlib inline
# --extend-ignore=DAR is because we don't want to run darglint
poetry run nbqa flake8 "${NB}" --max-line-length 100 --per-file-ignores="__init__.py:F401" \
--ignore=E402 --extend-ignore=DAR
--ignore=E402,W503 --extend-ignore=DAR

# With some ignored errors, since we don't care:
# that the notebook filename is capitalized (invalid-name)
@@ -46,9 +46,10 @@ function nbqa_ize()
--disable=missing-module-docstring --disable=missing-class-docstring \
--disable=missing-function-docstring \
--disable=wrong-import-position --disable=ungrouped-imports \
--disable=wrong-import-order\
--disable=wrong-import-order \
--extension-pkg-whitelist=numpy --disable=redefined-outer-name \
$PYLINT_EXTRA_OPTIONS
--disable=line-too-long \
${PYLINT_EXTRA_OPTIONS}
fi
}

@@ -99,6 +100,4 @@ then
echo "Running nbqa on ${NOTEBOOK}"
PYLINT_EXTRA_OPTIONS=""
nbqa_ize "${NOTEBOOK}" "${PYLINT_EXTRA_OPTIONS}"
fi


fi
2 changes: 2 additions & 0 deletions src/concrete/ml/torch/hybrid_model.py
@@ -492,6 +492,8 @@ def compile_model(
"""
# We do a forward pass where we accumulate inputs to use for compilation
self.set_fhe_mode(HybridFHEMode.CALIBRATE)

# Run the model to get the calibration data
self.model(x)

self.configuration = configuration
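For context, here is a minimal sketch of the calibrate-then-compile flow that compile_model drives, assuming HybridFHEModel wraps a torch module together with the name of the submodule to execute remotely; the toy model, input shape, and module name below are illustrative assumptions, not part of this commit:

import torch

from concrete.ml.torch.hybrid_model import HybridFHEModel

# Hypothetical model and calibration batch (shapes are illustrative)
model = torch.nn.Sequential(torch.nn.Linear(10, 16), torch.nn.ReLU(), torch.nn.Linear(16, 2))
x = torch.randn(32, 10)

# Wrap the model; "2" names the last Linear layer, assumed to be the part run under FHE
hybrid_model = HybridFHEModel(model, "2")

# compile_model() switches to HybridFHEMode.CALIBRATE and runs self.model(x)
# (the forward pass added above) to collect calibration data before compiling
hybrid_model.compile_model(x)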
60 changes: 39 additions & 21 deletions src/concrete/ml/torch/lora.py
@@ -1,6 +1,6 @@
"""This module contains classes for LoRA (Low-Rank Adaptation) training and custom layers."""

from typing import List
from typing import List, Tuple, Union

import torch

@@ -32,15 +32,16 @@ class LoraTraining(torch.nn.Module):
Args:
inference_model (torch.nn.Module): The base model to be fine-tuned.
n_layers_to_skip (int): Number of layers to skip. Linear layers that do not require
gradient to be propagated are skipped. Defaults to 1.
"""

def __init__(self, inference_model) -> None:
def __init__(self, inference_model, n_layers_to_skip: int = 1) -> None:
super().__init__()

self.inference_model = inference_model

self.replace_layers_with_custom(self.inference_model)
self.replace_layers_with_custom(self.inference_model, n_layers_to_skip)

self.optimizer = None
self.lr_scheduler = None
@@ -52,29 +53,27 @@ def __init__(self, inference_model) -> None:
self.run_optimizer = False

@staticmethod
def replace_layers_with_custom(model: torch.nn.Module, skip_first: bool = True):
def replace_layers_with_custom(model: torch.nn.Module, n_layers_to_skip: int):
"""Replace linear layers with custom ones.
This method replaces eligible linear layers in the model with custom layers
that are compatible with the LoRA training procedure.
Args:
model (torch.nn.Module): The model to replace layers in.
skip_first (bool): Whether to skip the first eligible layer.
n_layers_to_skip (int): Number of layers to skip.
"""
# Flag to track if the first layer has been skipped
skipped = False

def _replace(module: torch.nn.Module):
nonlocal skipped
nonlocal n_layers_to_skip
for name, child in list(module.named_children()):
# Skip modules containing "lora" in their name
if "lora" in name:
continue

if isinstance(child, LINEAR_LAYERS):
if skip_first and not skipped:
skipped = True
if n_layers_to_skip > 0:
n_layers_to_skip -= 1

# Skip the first eligible layer
continue
@@ -129,38 +128,57 @@ def update_training_parameters(
self.gradient_accumulation_steps = 1
self.max_grad_norm = None

def forward(self, inputs):
def forward(
self, inputs: Tuple[torch.Tensor, ...]
) -> Tuple[torch.Tensor, Union[torch.Tensor, None]]:
"""Forward pass of the LoRA training module.
Args:
inputs: A tuple containing input tensors and labels.
inputs (tuple): A tuple containing the input tensors. The first two elements should be
the features and the labels. Additional elements will be passed
to the model as needed.
Returns:
A tuple containing the loss and gradient norm.
Raises:
ValueError: If the model does not return a loss when `self.loss_fn` is None.
"""
assert (
len(inputs) >= 2
), "Expected at least two inputs in the tuple: inputs (x) and targets (y)"

# Remove this once hybrid model supports multiple inputs
# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4568
x, y = inputs
# Extract x (input features) and y (labels)
x, y = inputs[0], inputs[1]

# Forward pass
if self.loss_fn is None:
# Additional inputs, if any (e.g., attention_mask)
additional_inputs = inputs[2:]

# Assume model computes loss internally
outputs = self.inference_model(x, labels=y)
# If no loss function is provided, we assume the model can compute the loss internally
if self.loss_fn is None:
# Forward pass through the inference model with labels
outputs = self.inference_model(x, labels=y, *additional_inputs)

# Use getattr to safely access the loss attribute
# Use getattr to safely access the loss attribute from the outputs
loss = getattr(outputs, "loss", None)
if loss is None:
raise ValueError(
"The model did not return a loss. Ensure that 'labels' are correctly provided."
)
else:
outputs = self.inference_model(x)
# Forward pass through the inference model without labels
outputs = self.inference_model(x, *additional_inputs)

# If the outputs contain several keys, extract the logits
if isinstance(outputs, dict) and "logits" in outputs:
outputs = outputs["logits"]

# Compute the loss using the provided loss function
loss = self.loss_fn(outputs, y)

# Scale the loss based on gradient accumulation
loss = loss / self.gradient_accumulation_steps

# Update gradients
@@ -188,7 +206,7 @@ def forward(self, inputs):
elif self.calibrate:
self.inference_model.zero_grad()

return (loss, grad_norm)
return loss, grad_norm

def toggle_calibrate(self, enable: bool = True):
"""Toggle calibration mode.
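Based on the new signatures above, a minimal sketch of how LoraTraining might be exercised, modeled on the test fixtures in this commit; the toy model and batch below are assumptions rather than the GPT-2 use case itself:

from collections import namedtuple

import torch

from concrete.ml.torch.lora import LoraTraining

Output = namedtuple("Output", ["loss"])


class ToyModel(torch.nn.Module):
    """Toy stand-in for a LoRA-augmented model that computes its loss internally."""

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(8, 8)
        self.linear2 = torch.nn.Linear(8, 8)

    def forward(self, x, labels=None):
        logits = self.linear2(self.linear1(x))
        if labels is not None:
            # Mirror the HuggingFace-style output object with a .loss attribute
            return Output(loss=((logits - labels) ** 2).mean())
        return {"logits": logits}


# Skip the first eligible linear layer; remaining linear layers get custom replacements
lora_training = LoraTraining(ToyModel(), n_layers_to_skip=1)

x = torch.randn(4, 8)
y = torch.randn(4, 8)

# forward() takes a tuple (features, labels, *additional_inputs); any extra
# elements (for example an attention mask) are forwarded to the wrapped model
loss, grad_norm = lora_training((x, y))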
14 changes: 7 additions & 7 deletions tests/torch/test_lora.py
@@ -74,7 +74,7 @@ def forward(self, x, labels=None):
loss = ((logits - labels) ** 2).mean()
Output = namedtuple("Output", ["loss"])
return Output(loss=loss)
return logits
return {"logits": logits, "something_else": torch.tensor(1.0)}


@pytest.fixture
@@ -89,20 +89,20 @@ def base_lora_training(base_inference_model):
return LoraTraining(base_inference_model)


@pytest.mark.parametrize("skip_first", [True, False])
def test_lora_training_replace_layers(base_lora_training, skip_first):
@pytest.mark.parametrize("n_layers_to_skip", [0, 1, 2])
def test_lora_training_replace_layers(base_lora_training, n_layers_to_skip):
"""Test that LoraTraining replaces layers correctly."""
original_linear1 = base_lora_training.inference_model.linear1
original_lora_layer = base_lora_training.inference_model.lora_layer

# Replace layers with custom layers
base_lora_training.replace_layers_with_custom(
base_lora_training.inference_model, skip_first=skip_first
base_lora_training.inference_model, n_layers_to_skip=n_layers_to_skip
)

inference_model = base_lora_training.inference_model

if skip_first:
if n_layers_to_skip > 0:
# First eligible layer should be skipped
assert inference_model.linear1 is original_linear1
else:
@@ -169,7 +169,7 @@ def test_lora_training_forward_with_loss_fn(base_lora_training):
y = torch.tensor([[0.5, 1.5]])

outputs = base_lora_training.inference_model(x)
expected_loss = loss_fn(outputs, y) / base_lora_training.gradient_accumulation_steps
expected_loss = loss_fn(outputs["logits"], y) / base_lora_training.gradient_accumulation_steps

loss, _ = base_lora_training((x, y))

@@ -225,7 +225,7 @@ def test_lora_training_forward_with_optimizer(base_lora_training):
SimpleNamespace(gradient_accumulation_steps=1, max_grad_norm=1.0),
)
base_lora_training.replace_layers_with_custom(
base_lora_training.inference_model, skip_first=False
base_lora_training.inference_model, n_layers_to_skip=0
)
base_lora_training.toggle_run_optimizer(True)

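As a small self-contained illustration of the external-loss path the updated test exercises: when a loss_fn is provided and the model returns a dict, LoraTraining.forward extracts the "logits" entry before computing and scaling the loss. The tensors below are made-up values:

import torch

loss_fn = torch.nn.MSELoss()
gradient_accumulation_steps = 2

# Dict-style model output, as returned by the toy model in the test above
outputs = {"logits": torch.randn(1, 2), "something_else": torch.tensor(1.0)}
y = torch.tensor([[0.5, 1.5]])

# Same extraction and scaling as in LoraTraining.forward
logits = outputs["logits"] if isinstance(outputs, dict) and "logits" in outputs else outputs
loss = loss_fn(logits, y) / gradient_accumulation_steps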