From b75c00111127984e626684919775fd15ed3eeb9f Mon Sep 17 00:00:00 2001
From: donghaoran <donghaoran5@huawei.com>
Date: Thu, 26 Sep 2024 17:22:34 +0800
Subject: [PATCH 01/22] adaption for moe models

---
 src/peft/tuners/lora/tp_layer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/peft/tuners/lora/tp_layer.py b/src/peft/tuners/lora/tp_layer.py
index 394f3af2dd..e645c50d59 100644
--- a/src/peft/tuners/lora/tp_layer.py
+++ b/src/peft/tuners/lora/tp_layer.py
@@ -62,6 +62,7 @@ def __init__(
         self.is_parallel_a = isinstance(base_layer, backend.RowParallelLinear)
         self.fan_in_fan_out = fan_in_fan_out
         self._active_adapter = adapter_name
+        self.is_expert = base_layer.is_expert
 
         megatron_config = kwargs["megatron_config"]
         parallel_linear_kwargs = {"megatron_config": megatron_config}
@@ -131,6 +132,7 @@ def update_layer(
                 skip_bias_add=True,
                 init_method=init_method,
                 config=megatron_config,
+                is_expert=self.is_expert,
             )
             lora_b = nn.Linear(in_features=r, out_features=self.out_features, bias=False, dtype=torch.float32)
         else:
@@ -142,6 +144,7 @@ def update_layer(
                 gather_output=gather_output,
                 init_method=init_method,
                 config=megatron_config,
+                is_expert=self.is_expert,
             )
         self.lora_A[adapter_name] = lora_a
         self.lora_B[adapter_name] = lora_b

From c29810bad23f24c637d8e0bce42866d6748d94e2 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Fri, 27 Sep 2024 16:17:39 +0200
Subject: [PATCH 02/22] FIX: Change check if past_key_values is empty (#2106)

After transformers merged this PR:

https://github.com/huggingface/transformers/pull/33703

The bool of past_key_values (a Cache instance) would change from False
to True in one of our checks. Use get_seq_length() method instead, which
is consistent before and after that commit.

I checked the tests with the new change for both transformers before and
after that commit and they passed, so this change should be backwards
compatible.

Unrelated change: Mark X-LoRA scaling test as xfail-ing for now.

This should be addressed in a separate PR. Marking it to xfail for now
to get the original fix through CI.
---
 src/peft/peft_model.py | 3 ++-
 tests/test_xlora.py    | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py
index 3a09200217..fb567ebfb3 100644
--- a/src/peft/peft_model.py
+++ b/src/peft/peft_model.py
@@ -1776,7 +1776,8 @@ def prepare_inputs_for_generation(self, *args, task_ids: Optional[torch.Tensor]
 
             # no past_key_values or past_key_values empty cache
             requires_prompt_injection = (model_kwargs["past_key_values"] is None) or (
-                isinstance(model_kwargs["past_key_values"], transformers.Cache) and not model_kwargs["past_key_values"]
+                isinstance(model_kwargs["past_key_values"], transformers.Cache)
+                and not model_kwargs["past_key_values"].get_seq_length()
             )
 
             if requires_prompt_injection and peft_config.peft_type == PeftType.PREFIX_TUNING:
diff --git a/tests/test_xlora.py b/tests/test_xlora.py
index b84635e6ec..7b70a4b240 100644
--- a/tests/test_xlora.py
+++ b/tests/test_xlora.py
@@ -135,6 +135,7 @@ def test_functional(self, tokenizer, model):
 
     # TODO: remove the skip when 4.45 is released!
     @pytest.mark.skipif(not uses_transformers_4_45, reason="Requires transformers >= 4.45")
+    @pytest.mark.xfail
     def test_scalings_logging_methods(self, tokenizer, model):
         model.enable_scalings_logging()
 

From aa3bd8fbf6820bb000e148a289b6b0c48c339774 Mon Sep 17 00:00:00 2001
From: Aliakbar <34204311+Salehbigdeli@users.noreply.github.com>
Date: Mon, 30 Sep 2024 11:03:41 +0200
Subject: [PATCH 03/22] DOC Update source install instruction (#2110)

---
 docs/source/install.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/install.md b/docs/source/install.md
index c1f435a5ef..d89c2da7f6 100644
--- a/docs/source/install.md
+++ b/docs/source/install.md
@@ -43,5 +43,5 @@ repository:
 ```bash
 git clone https://github.com/huggingface/peft
 cd peft
-pip install -e .
+pip install -e .[test]
 ```

From 2a807359bd34df64dd8dd29c1bd1024125d13312 Mon Sep 17 00:00:00 2001
From: Zeju1997 <47625089+Zeju1997@users.noreply.github.com>
Date: Tue, 1 Oct 2024 16:51:18 +0200
Subject: [PATCH 04/22] FIX Refactor OFT, small changes to BOFT (#1996)

The previous OFT implementation contained a few errors, which are fixed now.

Unfortunately, this makes previous OFT checkpoints invalid, which is why an
error will be raised. Users are instructed to either retrain the OFT adapter or
switch to an old PEFT version.
---
 src/peft/config.py                      |  10 +
 src/peft/mixed_model.py                 |   2 -
 src/peft/tuners/boft/config.py          |  18 +-
 src/peft/tuners/boft/layer.py           |  32 +-
 src/peft/tuners/oft/config.py           |  93 ++-
 src/peft/tuners/oft/layer.py            | 821 +++++++++++++++++-------
 src/peft/tuners/oft/model.py            | 318 ++++++++-
 tests/test_config.py                    |   4 +-
 tests/test_custom_models.py             |  43 +-
 tests/test_decoder_models.py            |  63 +-
 tests/test_encoder_decoder_models.py    |   2 +
 tests/test_feature_extraction_models.py |   3 +
 tests/test_mixed.py                     |  41 +-
 tests/test_stablediffusion.py           |   4 +-
 tests/test_vision_models.py             |   2 +-
 tests/testing_common.py                 |  14 +-
 16 files changed, 1104 insertions(+), 366 deletions(-)

diff --git a/src/peft/config.py b/src/peft/config.py
index 9cdcb08e9a..7c2ad02fe4 100644
--- a/src/peft/config.py
+++ b/src/peft/config.py
@@ -149,6 +149,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional
 
         loaded_attributes = cls.from_json_file(config_file)
         kwargs = {**class_kwargs, **loaded_attributes}
+        kwargs = cls.check_kwargs(**kwargs)
         return cls.from_peft_type(**kwargs)
 
     @classmethod
@@ -213,6 +214,15 @@ def _get_peft_type(
         loaded_attributes = cls.from_json_file(config_file)
         return loaded_attributes["peft_type"]
 
+    @classmethod
+    def check_kwargs(cls, **kwargs):
+        """Check kwargs before initializing the config instance.
+
+        Subclasses can override this method to add specific checks.
+
+        """
+        return kwargs
+
     @property
     def is_prompt_learning(self) -> bool:
         r"""
diff --git a/src/peft/mixed_model.py b/src/peft/mixed_model.py
index d9e231e018..907227de36 100644
--- a/src/peft/mixed_model.py
+++ b/src/peft/mixed_model.py
@@ -34,7 +34,6 @@
     LoKrModel,
     LoraModel,
     MixedModel,
-    OFTModel,
 )
 from .tuners.mixed import COMPATIBLE_TUNER_TYPES
 from .utils import PeftType, _set_adapter, _set_trainable
@@ -46,7 +45,6 @@
     PeftType.LOKR: LoKrModel,
     PeftType.ADALORA: AdaLoraModel,
     PeftType.IA3: IA3Model,
-    PeftType.OFT: OFTModel,
 }
 
 
diff --git a/src/peft/tuners/boft/config.py b/src/peft/tuners/boft/config.py
index ab704b5d95..ecd6a2c13c 100644
--- a/src/peft/tuners/boft/config.py
+++ b/src/peft/tuners/boft/config.py
@@ -32,7 +32,9 @@ class BOFTConfig(PeftConfig):
         boft_block_num (`int`): Number of BOFT blocks per injected layer.
         boft_n_butterfly_factor (`int`): Number of butterfly factors across different layers.
         target_modules (`Union[List[str],str]`): The names of the modules to apply the adapter to.
-        boft_dropout (`float`): The multiplicative dropout probability for BOFT layers.
+        boft_dropout (`float`):
+            The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the
+            dropout layer in LoRA.
         fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
             For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set
             to `True`.
@@ -81,7 +83,12 @@ class BOFTConfig(PeftConfig):
             "example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ",
         },
     )
-    boft_dropout: float = field(default=0.0, metadata={"help": "BOFT multiplicative dropout"})
+    boft_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "BOFT multiplicative dropout, randomly setting blocks of OFT to be identity matrix, similar to the dropout layer in LoRA."
+        },
+    )
     fan_in_fan_out: bool = field(
         default=False,
         metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
@@ -125,9 +132,10 @@ def __post_init__(self):
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
         if self.boft_block_size == 0 and self.boft_block_num == 0:
-            raise ValueError("You must specify either boft_block_size or boft_block_num.")
+            raise ValueError(
+                f"Either `boft_block_size` or `boft_block_num` must be non-zero. Currently, boft_block_size = {self.boft_block_size} and boft_block_num = {self.boft_block_num}."
+            )
         if not (self.boft_block_size != 0) ^ (self.boft_block_num != 0):
             raise ValueError(
-                f"You can only specify either boft_block_size ({self.boft_block_size}) or boft_block_num ({self.boft_block_num}), "
-                "but not both simultaneously, because boft_block_size x boft_block_num != in_features."
+                f"You can only specify either boft_block_size ({self.boft_block_size}) or boft_block_num ({self.boft_block_num}), but not both simultaneously, because boft_block_size x boft_block_num == in_features."
             )
diff --git a/src/peft/tuners/boft/layer.py b/src/peft/tuners/boft/layer.py
index 0ab886a5e5..df99ac1bbf 100644
--- a/src/peft/tuners/boft/layer.py
+++ b/src/peft/tuners/boft/layer.py
@@ -324,8 +324,7 @@ def update_layer(
 
         else:
             raise ValueError(
-                f"You can only specify either boft_block_size ({boft_block_size}) or boft_block_num ({boft_block_num}), but not both simultaneously or setting both"
-                "to be 0, because boft_block_size x boft_block_num != in_features."
+                "Something went wrong, please report this error: https://github.com/huggingface/peft/issues"
             )
 
         # In OFT you can specify the number of blocks to be 1
@@ -710,11 +709,6 @@ def update_layer(
         conv_filter_dim = self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
 
         # Initialize the BOFT parameters.
-        if not (boft_block_size != 0) ^ (boft_block_num != 0):
-            raise ValueError(
-                f"You can only specify either boft_block_size ({boft_block_size}) or boft_block_num ({boft_block_num}), but not both simultaneously, because boft_block_size x boft_block_num != in_features."
-            )
-
         if boft_block_size == 0 and boft_block_num != 0:
             if conv_filter_dim % boft_block_num != 0:
                 raise ValueError(
@@ -752,7 +746,9 @@ def update_layer(
             boft_block_num = int(conv_filter_dim // boft_block_size)
 
         else:
-            raise ValueError("Unknown error!")
+            raise ValueError(
+                "Something went wrong, please report this error: https://github.com/huggingface/peft/issues"
+            )
 
         # In OFT you can specify the number of blocks to be 1
         if boft_n_butterfly_factor != 0:
@@ -776,7 +772,7 @@ def update_layer(
         self.boft_R[adapter_name] = nn.Parameter(
             torch.zeros(boft_n_butterfly_factor + 1, boft_block_num, boft_block_size, boft_block_size)
         )
-        self.boft_s[adapter_name] = nn.Parameter(torch.ones(1, int(self.out_features)))
+        self.boft_s[adapter_name] = nn.Parameter(torch.ones(int(self.out_features), 1))
 
         self.reset_boft_parameters(adapter_name, init_weights)
 
@@ -815,9 +811,11 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
                     butterfly_oft_mat, boft_s = self.get_delta_weight(active_adapter)
 
                     orig_weight = orig_weight.view(
-                        self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], self.out_features
+                        self.out_features, self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
                     )
+                    orig_weight = torch.transpose(orig_weight, 0, 1)
                     orig_weight = torch.mm(butterfly_oft_mat, orig_weight)
+                    orig_weight = torch.transpose(orig_weight, 0, 1)
                     orig_weight = orig_weight * boft_s
                     orig_weight = orig_weight.view(
                         self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0]
@@ -829,9 +827,11 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
 
                     orig_weight = base_layer.weight.data.clone()
                     orig_weight = orig_weight.view(
-                        self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], self.out_features
+                        self.out_features, self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
                     )
+                    orig_weight = torch.transpose(orig_weight, 0, 1)
                     orig_weight = torch.mm(butterfly_oft_mat, orig_weight)
+                    orig_weight = torch.transpose(orig_weight, 0, 1)
                     orig_weight = orig_weight * boft_s
                     orig_weight = orig_weight.view(
                         self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0]
@@ -855,10 +855,12 @@ def unmerge(self) -> None:
 
                 orig_weight = self.get_base_layer().weight.data.clone()
                 orig_weight = orig_weight.view(
-                    self.in_features * self.get_base_layer().kernel_size[0] * self.get_base_layer().kernel_size[0],
                     self.out_features,
+                    self.in_features * self.get_base_layer().kernel_size[0] * self.get_base_layer().kernel_size[0],
                 )
+                orig_weight = torch.transpose(orig_weight, 0, 1)
                 orig_weight = torch.mm(butterfly_oft_mat.t(), orig_weight)
+                orig_weight = torch.transpose(orig_weight, 0, 1)
                 orig_weight = orig_weight * (1 / boft_s)
                 orig_weight = orig_weight.view(
                     self.out_features,
@@ -917,7 +919,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
                 device=x.device,
                 dtype=x.dtype,
             )
-            boft_scale = torch.ones((1, int(self.out_features)), device=x.device, dtype=x.dtype)
+            boft_scale = torch.ones((int(self.out_features), 1), device=x.device, dtype=x.dtype)
 
             for active_adapter in self.active_adapters:
                 if active_adapter not in self.boft_R.keys():
@@ -954,10 +956,12 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
 
             orig_weight = self.base_layer.weight.data
             orig_weight = orig_weight.view(
-                self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
                 self.out_features,
+                self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
             )
+            orig_weight = torch.transpose(orig_weight, 0, 1)
             rotated_weight = torch.mm(boft_rotation, orig_weight)
+            rotated_weight = torch.transpose(rotated_weight, 0, 1)
 
             scaled_rotated_weight = rotated_weight * boft_scale
 
diff --git a/src/peft/tuners/oft/config.py b/src/peft/tuners/oft/config.py
index ba3b9a4401..13a6b5d7ce 100644
--- a/src/peft/tuners/oft/config.py
+++ b/src/peft/tuners/oft/config.py
@@ -12,22 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Literal, Optional, Union
 
-from peft.tuners.lycoris_utils import LycorisConfig
+from peft.config import PeftConfig
 from peft.utils import PeftType
 
 
 @dataclass
-class OFTConfig(LycorisConfig):
+class OFTConfig(PeftConfig):
     """
     This is the configuration class to store the configuration of a [`OFTModel`].
 
     Args:
-        r (`int`): OFT rank.
-        module_dropout (`int`): The dropout probability for disabling OFT modules during training.
-        target_modules (`Optional[Union[List[str], str]]`):
+        r (`int`): OFT rank, number of OFT blocks per injected layer.
+        oft_block_size (`int`): OFT block size across different layers.
+        module_dropout (`float`):
+            The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the
+            dropout layer in LoRA.
+        target_modules (`Optional[Union[list[str], str]]`):
             The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
             names will be replaced. When passing a string, a regex match will be performed. When passing a list of
             strings, either an exact match will be performed or it is checked if the name of the module ends with any
@@ -35,6 +40,10 @@ class OFTConfig(LycorisConfig):
             the output layer. If this is not specified, modules will be chosen according to the model architecture. If
             the architecture is not known, an error will be raised -- in this case, you should specify the target
             modules manually.
+        fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
+        bias (`str`): Bias type for OFT. Can be 'none', 'all' or 'oft_only'. If 'all' or 'oft_only', the
+            corresponding biases will be updated during training. Be aware that this means that, even when disabling
+            the adapters, the model will not produce the same output as the base model would have without adaptation.
         init_weights (`bool`):
             Whether to perform initialization of OFT weights.
         layers_to_transform (`Union[List[int], int]`):
@@ -56,11 +65,21 @@ class OFTConfig(LycorisConfig):
             Whether to share the OFT parameters between blocks or not. This is `False` by default.
     """
 
-    r: int = field(default=8, metadata={"help": "OFT rank"})
+    r: int = field(default=8, metadata={"help": "OFT rank, number of OFT blocks per injected layer."})
+    oft_block_size: int = field(
+        default=0,
+        metadata={
+            "help": "OFT block size across different layers.",
+            "note": "You can only specify either r or oft_block_size, but not both simultaneously, because r x oft_block_size = layer dimension.",
+        },
+    )
     module_dropout: float = field(
-        default=0.0, metadata={"help": "The dropout probability for disabling OFT modules during training"}
+        default=0.0,
+        metadata={
+            "help": "OFT multiplicative dropout, randomly setting blocks of OFT to be identity matrix, similar to the dropout layer in LoRA."
+        },
     )
-    target_modules: Optional[Union[List[str], str]] = field(
+    target_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": "List of module names or regex expression of the module names to replace with OFT."
@@ -68,6 +87,13 @@ class OFTConfig(LycorisConfig):
             "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer."
         },
     )
+    fan_in_fan_out: bool = field(
+        default=False,
+        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
+    )
+    bias: Literal["none", "all", "oft_only"] = field(
+        default="none", metadata={"help": "Bias type for OFT. Can be 'none', 'all' or 'oft_only'"}
+    )
     init_weights: bool = field(
         default=True,
         metadata={
@@ -77,7 +103,7 @@ class OFTConfig(LycorisConfig):
             ),
         },
     )
-    layers_to_transform: Optional[Union[List[int], int]] = field(
+    layers_to_transform: Optional[Union[list[int], int]] = field(
         default=None,
         metadata={
             "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
@@ -89,7 +115,7 @@ class OFTConfig(LycorisConfig):
             "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
         },
     )
-    modules_to_save: Optional[List[str]] = field(
+    modules_to_save: Optional[list[str]] = field(
         default=None,
         metadata={
             "help": "List of modules apart from OFT layers to be set as trainable and saved in the final checkpoint. "
@@ -111,9 +137,54 @@ class OFTConfig(LycorisConfig):
         default=False,
         metadata={"help": "Whether to share the OFT parameters between blocks or not."},
     )
+    rank_pattern: Optional[dict] = field(
+        default_factory=dict,
+        metadata={
+            "help": (
+                "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. "
+                "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}"
+                "Important: the rank pattern won't be applied to the layers after 0.12.1.dev0!"
+            )
+        },
+    )
+    alpha_pattern: Optional[dict] = field(
+        default_factory=dict,
+        metadata={
+            "help": (
+                "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. "
+                "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}"
+                "Important: the alpha pattern won't be applied to the layers after 0.12.1.dev0!"
+            )
+        },
+    )
 
     def __post_init__(self):
         self.peft_type = PeftType.OFT
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        if self.r == 0 and self.oft_block_size == 0:
+            raise ValueError(
+                f"Either `r` or `oft_block_size` must be non-zero. Currently, r = {self.r} and oft_block_size = {self.oft_block_size}."
+            )
+        if not (self.r != 0) ^ (self.oft_block_size != 0):
+            raise ValueError(
+                f"You can only specify either r ({self.r}) or oft_block_size ({self.oft_block_size}), but not both simultaneously, because r x oft_block_size == in_features."
+            )
+
+    @classmethod
+    def check_kwargs(cls, **kwargs):
+        r"""
+        Check if the kwargs are valid for the configuration.
+
+        Args:
+            kwargs (additional keyword arguments, *optional*):
+                Additional keyword arguments passed along to the child class initialization.
+        """
+        if "oft_block_size" not in kwargs:
+            raise ValueError(
+                "OFT has been updated since PEFT 0.14.0. Your trained adapter weights are incompatible "
+                "with the latest version of OFT. Please retrain your adapter weights with newer PEFT versions. "
+                "Alternatively, downgrade PEFT to version 0.13.0 to use the old adapter weights."
+            )
+        return super().check_kwargs(**kwargs)
diff --git a/src/peft/tuners/oft/layer.py b/src/peft/tuners/oft/layer.py
index 965f2e83ff..7d58a8c023 100644
--- a/src/peft/tuners/oft/layer.py
+++ b/src/peft/tuners/oft/layer.py
@@ -11,111 +11,319 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 import math
 import warnings
-from typing import Any, List, Optional, Set, Tuple
+from typing import Any, Optional, Union
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
-from peft.tuners.lycoris_utils import LycorisLayer, check_adapters_to_merge
+from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
 
 
-class OFTLayer(nn.Module, LycorisLayer):
+class MultiplicativeDropoutLayer(nn.Module):
+    """
+    Implements the multiplicative dropout layer for OFT.
+    """
+
+    def __init__(self, p=0.0):
+        """
+        Initializes the multiplicative dropout layer.
+
+        Parameters:
+        p (float): The probability of dropping out a block. Defaults to 0.0.
+        """
+        super().__init__()
+        self.p = p
+
+    def forward(self, x):
+        """
+        Applies multiplicative dropout to the input tensor.
+
+        Parameters:
+        x (Tensor): The input tensor of shape (D, H, H), where `D` represents
+                    the number of OFT blocks, and `H` is the size of the square blocks along the last two dimensions,
+                    the block size in OFT.
+        """
+        if self.training:
+            # Ensure the last two dimensions are the same
+            if x.shape[-1] != x.shape[-2]:
+                raise ValueError("The last two dimensions of input should be the same!")
+
+            D, H, _ = x.shape
+
+            # If block share, skip the multiplicative dropout
+            if D == 1:
+                return x
+
+            num_to_replace = int(self.p * D)
+            num_zeros = D - num_to_replace
+            mask = torch.cat([torch.ones(num_to_replace, device=x.device), torch.zeros(num_zeros, device=x.device)])
+            mask = mask[torch.randperm(D)].view(D, 1, 1)
+            eye_matrix = torch.eye(H, device=x.device).repeat(D, 1, 1)
+            x = (1 - mask) * x + mask * eye_matrix
+        return x
+
+
+class OFTLayer(BaseTunerLayer):
+    """
+    Implements the OFT layer.
+    """
+
     # All names of layers that may contain adapter weights
-    adapter_layer_names = ("oft_r",)
+    adapter_layer_names = ("oft_r", "oft_s")
     # other_param_names is defined on parent class
+    other_param_names = ("r", "oft_block_size", "oft_dropout")
 
-    def __init__(self, base_layer: nn.Module):
-        super().__init__()
-        LycorisLayer.__init__(self, base_layer)
+    def __init__(self, base_layer: nn.Module, **kwargs) -> None:
+        """
+        Initializes the OFT layer.
+
+        Note, currently only support linear layer and convolutional layer, with further support for other layers to be
+        added soon.
 
+        Parameters:
+        base_layer: the pretrained model layer
+        """
+        self.base_layer = base_layer
         # OFT info
         self.oft_r = nn.ParameterDict({})
+        self.oft_s = nn.ParameterDict({})
+        self.r = {}
+        self.oft_block_size = {}
+        self.oft_dropout = nn.ModuleDict({})
         self.coft = {}
         self.eps = {}
         self.block_share = {}
+        # Mark the weight as unmerged
+        self._disable_adapters = False
+        self.merged_adapters = []
+        self.kwargs = kwargs
+
+        base_layer = self.get_base_layer()
+
+        if isinstance(base_layer, nn.Linear):
+            in_features, out_features = base_layer.in_features, base_layer.out_features
+        elif isinstance(base_layer, nn.Conv2d):
+            in_features, out_features = base_layer.in_channels, base_layer.out_channels
+        else:
+            raise ValueError(f"Unsupported layer type {type(base_layer)}")
+
+        self.in_features = in_features
+        self.out_features = out_features
 
     @property
-    def _available_adapters(self) -> Set[str]:
+    def _available_adapters(self) -> set[str]:
         return {*self.oft_r}
 
-    def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...], block_share: bool):
-        if block_share:
-            self.oft_r[adapter_name] = nn.Parameter(torch.empty(1, math.ceil(shape[0] / r), math.ceil(shape[0] / r)))
-        else:
-            self.oft_r[adapter_name] = nn.Parameter(torch.empty(r, math.ceil(shape[0] / r), math.ceil(shape[0] / r)))
+    def set_scale(self, adapter, scale):
+        if adapter not in self.scaling:
+            # Ignore the case where the adapter is not in the layer
+            return
+
+        warnings.warn("Scaling operation for OFT not supported! Automatically set scale to 1.")
 
-    def reset_adapter_parameters(self, adapter_name: str):
-        nn.init.zeros_(self.oft_r[adapter_name])
+    def scale_layer(self, scale: float) -> None:
+        if scale == 1:
+            return
 
-    def reset_adapter_parameters_random(self, adapter_name: str):
-        nn.init.kaiming_uniform_(self.oft_r[adapter_name], a=math.sqrt(5))
+        for active_adapter in self.active_adapters:
+            if active_adapter not in self.oft_r.keys():
+                continue
 
-    def update_layer(
-        self,
-        adapter_name: str,
-        r: int,
-        module_dropout: float,
-        init_weights: bool,
-        coft: bool = False,
-        eps: float = 6e-5,
-        block_share: bool = False,
-        **kwargs,
-    ) -> None:
+            warnings.warn("Scaling operation for OFT not supported! Automatically set scale to 1.")
+
+    def unscale_layer(self, scale=None) -> None:
+        for active_adapter in self.active_adapters:
+            if active_adapter not in self.oft_r.keys():
+                continue
+
+            warnings.warn("Unscaling operation for OFT not supported! Keeping scale to 1.")
+
+    def update_layer(self, adapter_name, r, oft_block_size, module_dropout, coft, eps, block_share, init_weights):
+        """
+        Update the linear layer with trainable OFT weights. Override for other layer types.
+        """
         """Internal function to create oft adapter
 
         Args:
             adapter_name (`str`): Name for the adapter to add.
             r (`int`): Rank for the added adapter.
-            module_dropout (`float`): The dropout probability for disabling adapter during training.
-            init_weights (`bool`): Whether to initialize weights.
+            oft_block_size (`int`): The block size for added adapter.
+            module_dropout (`float`):
+                The multiplicative dropout probability for disabling adapter blocks during training.
             coft (`bool`): Whether to use the constrained variant of OFT or not.
             eps (`float`):
                 The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True.
             block_share (`bool`): Whether to share the OFT parameters between blocks or not.
+            init_weights (`bool`): Whether to initialize weights.
         """
-        if r <= 0:
-            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
+        # Initialize the MultiplicativeDropoutLayer for module_dropout > 0.0.
+        if module_dropout > 0.0:
+            oft_dropout_layer = MultiplicativeDropoutLayer(p=module_dropout)
+        else:
+            oft_dropout_layer = nn.Identity()
+        self.oft_dropout.update(nn.ModuleDict({adapter_name: oft_dropout_layer}))
+
+        if r == 0 and oft_block_size != 0:
+            if self.in_features % oft_block_size != 0 or oft_block_size > self.in_features:
+                old_oft_block_size = oft_block_size
+                oft_block_size = self.adjust_oft_parameters(self.in_features, oft_block_size)
+                warnings.warn(
+                    f"Invalid `oft_block_size` ({old_oft_block_size})! Adjusted `oft_block_size` to ({oft_block_size})."
+                )
+            r = int(self.in_features // oft_block_size)
+        elif r != 0 and oft_block_size == 0:
+            if self.in_features % r != 0 or r > self.in_features:
+                old_r = r
+                r = self.adjust_oft_parameters(self.in_features, r)
+                warnings.warn(f"Invalid `r` ({old_r})! Adjusted `r` to ({r}).")
+            oft_block_size = int(self.in_features // r)
+        else:
+            raise ValueError(
+                "Something went wrong, please report this error: https://github.com/huggingface/peft/issues"
+            )
 
-        self.r[adapter_name] = r
-        self.module_dropout[adapter_name] = module_dropout
         self.coft[adapter_name] = coft
         self.block_share[adapter_name] = block_share
+        self.eps[adapter_name] = eps * math.ceil(self.out_features / r) * math.ceil(self.out_features / r)
 
-        # Determine shape of OFT weights
-        base_layer = self.get_base_layer()
-        if isinstance(base_layer, nn.Linear):
-            shape = tuple(base_layer.weight.shape)
-        elif isinstance(base_layer, nn.Conv2d):
-            shape = (
-                base_layer.out_channels,
-                base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1],
+        # Create weights with provided shape
+        if block_share:
+            self.oft_r[adapter_name] = nn.Parameter(
+                torch.empty(1, math.ceil(self.in_features / r), math.ceil(self.in_features / r))
             )
         else:
-            raise TypeError(f"OFT is not implemented for base layers of type {type(base_layer).__name__}")
-
-        self.eps[adapter_name] = eps * math.ceil(shape[0] / r) * math.ceil(shape[0] / r)
-
-        # Create weights with provided shape
-        self.create_adapter_parameters(adapter_name, r, shape, block_share)
+            self.oft_r[adapter_name] = nn.Parameter(
+                torch.empty(r, math.ceil(self.in_features / r), math.ceil(self.in_features / r))
+            )
+        self.oft_s[adapter_name] = nn.Parameter(torch.empty(int(self.out_features), 1))
 
         # Initialize weights
-        if init_weights:
-            self.reset_adapter_parameters(adapter_name)
-        else:
-            self.reset_adapter_parameters_random(adapter_name)
+        self.reset_oft_parameters(adapter_name, init_weights)
+
+        # set oft r and block size
+        self.r[adapter_name] = r
+        self.oft_block_size[adapter_name] = oft_block_size
 
         # Move new weights to device
         self._move_adapter_to_device_of_base_layer(adapter_name)
         self.set_adapter(self.active_adapters)
 
-    def unscale_layer(self, scale=None) -> None:
-        # scale is not used
-        pass
+    def reset_oft_parameters(self, adapter_name, init_weights):
+        """
+        Reset the OFT parameters.
+        """
+        if init_weights is False:
+            nn.init.normal_(self.oft_r[adapter_name], mean=0.0, std=0.1)
+            nn.init.normal_(self.oft_s[adapter_name], mean=1.0, std=0.1)
+            return
+
+        if adapter_name in self.oft_r.keys():
+            if init_weights is True:
+                # initialize oft_r to zero
+                nn.init.zeros_(self.oft_r[adapter_name])
+                nn.init.ones_(self.oft_s[adapter_name])
+            else:
+                raise ValueError(f"Unknown initialization {init_weights=}")
+
+    def _cayley_batch(self, data: torch.Tensor) -> torch.Tensor:
+        """
+        Perform the Cayley parametrization on a batch of skew-symmetric matrices.
+
+        Args:
+            data: A batch of skew-symmetric matrices of shape (b, r, c).
+        """
+        b, r, c = data.shape
+        # Ensure the input matrix is skew-symmetric
+        skew_mat = 0.5 * (data - data.transpose(1, 2))
+        id_mat = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c)  # noqa: E741
+
+        # Perform the Cayley parametrization
+        Q = torch.linalg.solve(id_mat + skew_mat, id_mat - skew_mat, left=False)
+
+        return Q
+
+    # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L155
+    def _block_diagonal(self, oft_r: torch.Tensor, rank: int) -> torch.Tensor:
+        if oft_r.shape[0] == 1:
+            # block share
+            blocks = [oft_r[0, ...] for i in range(rank)]
+        else:
+            blocks = [oft_r[i, ...] for i in range(rank)]
+
+        # Use torch.block_diag to create the block diagonal matrix
+        A = torch.block_diag(*blocks)
+
+        return A
+
+    # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L52
+    def _project_batch(self, oft_r, eps=1e-5):
+        # scaling factor for each of the smaller block matrix
+        eps = eps * 1 / torch.sqrt(torch.tensor(oft_r.shape[0]))
+        I = (  # noqa: E741
+            torch.zeros((oft_r.size(1), oft_r.size(1)), device=oft_r.device, dtype=oft_r.dtype)
+            .unsqueeze(0)
+            .expand_as(oft_r)
+        )
+        diff = oft_r - I
+        norm_diff = torch.norm(oft_r - I, dim=(1, 2), keepdim=True)
+        mask = (norm_diff <= eps).bool()
+        out = torch.where(mask, oft_r, I + eps * (diff / norm_diff))
+        return out
+
+    def adjust_oft_parameters(self, in_features, params):
+        """
+        Adjust the OFT parameters to be divisible by the in_features dimension.
+        """
+        if params < in_features:
+            higher_params = params
+            while higher_params <= in_features and in_features % higher_params != 0:
+                higher_params += 1
+        else:
+            return in_features
+
+        lower_params = params
+        while lower_params > 1 and in_features % lower_params != 0:
+            lower_params -= 1
+
+        if (params - lower_params) <= (higher_params - params):
+            return lower_params
+        else:
+            return higher_params
+
+
+class Linear(nn.Module, OFTLayer):
+    """OFT implemented in Linear layer"""
+
+    def __init__(
+        self,
+        base_layer,
+        adapter_name: str,
+        r: int = 8,
+        oft_block_size: int = 0,
+        module_dropout: float = 0.0,
+        coft: bool = False,
+        eps: float = 6e-5,
+        block_share: bool = False,
+        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
+        init_weights: Union[bool, str] = True,
+        is_target_conv_1d_layer: bool = False,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        OFTLayer.__init__(self, base_layer, **kwargs)
+        self.fan_in_fan_out = fan_in_fan_out
+
+        self._active_adapter = adapter_name
 
-    def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None:
+        self.update_layer(adapter_name, r, oft_block_size, module_dropout, coft, eps, block_share, init_weights)
+        self.is_target_conv_1d_layer = is_target_conv_1d_layer
+
+    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
         """
         Merge the active adapter weights into the base weights
 
@@ -136,42 +344,32 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N
         for active_adapter in adapter_names:
             if active_adapter in self._available_adapters:
                 base_layer = self.get_base_layer()
+                if safe_merge:
+                    # Note that safe_merge will be slower than the normal merge
+                    # because of the copy operation.
+                    orig_weights = base_layer.weight.data
+                    oft_mat, oft_s = self.get_delta_weight(active_adapter)
+                    orig_weights = torch.transpose(orig_weights, 0, 1)
+                    orig_weights = torch.mm(oft_mat, orig_weights)
+                    orig_weights = torch.transpose(orig_weights, 0, 1)
+                    orig_weights = orig_weights * oft_s
 
-                orig_weights = base_layer.weight.data
-                if isinstance(base_layer, nn.Linear):
+                    if not torch.isfinite(orig_weights).all():
+                        raise ValueError(
+                            f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
+                        )
+
+                    base_layer.weight.data = orig_weights.contiguous()
+                else:
+                    oft_mat, oft_s = self.get_delta_weight(active_adapter)
+                    orig_weights = base_layer.weight.data
                     orig_weights = torch.transpose(orig_weights, 0, 1)
-                elif isinstance(base_layer, nn.Conv2d):
-                    orig_weights = orig_weights.view(
-                        [
-                            base_layer.out_channels,
-                            base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1],
-                        ]
-                    )
+                    orig_weights = torch.mm(oft_mat, orig_weights)
                     orig_weights = torch.transpose(orig_weights, 0, 1)
-                delta_weight = self.get_delta_weight(active_adapter)
-                if orig_weights.shape[1] != delta_weight.shape[1]:
-                    # when in channels is not divisible by r
-                    delta_weight = delta_weight[: orig_weights.shape[1], : orig_weights.shape[1]]
-                new_weights = torch.mm(orig_weights, delta_weight)
-                if isinstance(base_layer, nn.Linear):
-                    new_weights = torch.transpose(new_weights, 0, 1)
-                elif isinstance(base_layer, nn.Conv2d):
-                    new_weights = torch.transpose(new_weights, 0, 1)
-                    new_weights = new_weights.view(
-                        [
-                            base_layer.out_channels,
-                            base_layer.in_channels,
-                            base_layer.kernel_size[0],
-                            base_layer.kernel_size[1],
-                        ]
-                    )
+                    orig_weights = orig_weights * oft_s
 
-                if safe_merge and not torch.isfinite(new_weights).all():
-                    raise ValueError(
-                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
-                    )
+                    base_layer.weight.data = orig_weights.contiguous()
 
-                base_layer.weight.data = new_weights.contiguous()
                 self.merged_adapters.append(active_adapter)
 
     def unmerge(self) -> None:
@@ -183,94 +381,39 @@ def unmerge(self) -> None:
             return
         while len(self.merged_adapters) > 0:
             active_adapter = self.merged_adapters.pop()
-            if active_adapter in self._available_adapters:
-                base_layer = self.get_base_layer()
-                new_weights = base_layer.weight.data
-                if isinstance(base_layer, nn.Linear):
-                    new_weights = torch.transpose(new_weights, 0, 1)
-                elif isinstance(base_layer, nn.Conv2d):
-                    new_weights = new_weights.view(
-                        [
-                            base_layer.out_channels,
-                            base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1],
-                        ]
-                    )
-                    new_weights = torch.transpose(new_weights, 0, 1)
-                delta_weight = self.get_delta_weight(active_adapter)
-                if new_weights.shape[1] != delta_weight.shape[1]:
-                    # when in channels is not divisible by r
-                    delta_weight = delta_weight[: new_weights.shape[1], : new_weights.shape[1]]
-                delta_inv = torch.inverse(delta_weight)
-                orig_weights = torch.mm(new_weights, delta_inv)
-
-                if isinstance(base_layer, nn.Linear):
-                    orig_weights = torch.transpose(orig_weights, 0, 1)
-                elif isinstance(base_layer, nn.Conv2d):
-                    orig_weights = torch.transpose(orig_weights, 0, 1)
-                    orig_weights = orig_weights.reshape(
-                        [
-                            base_layer.out_channels,
-                            base_layer.in_channels,
-                            base_layer.kernel_size[0],
-                            base_layer.kernel_size[1],
-                        ]
-                    )
-                base_layer.weight.data = orig_weights.contiguous()
+            if active_adapter in self.oft_r.keys():
+                oft_mat, oft_s = self.get_delta_weight(active_adapter)
+
+                orig_weights = self.get_base_layer().weight.data
+                orig_weights = torch.transpose(orig_weights, 0, 1)
+                orig_weights = torch.mm(oft_mat.t(), orig_weights)
+                orig_weights = torch.transpose(orig_weights, 0, 1)
+
+                self.get_base_layer().weight.data = orig_weights * (1 / oft_s)
+
+    def get_delta_weight(self, adapter_name) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Compute the delta weight for the given adapter.
+
+        Args:
+            adapter (str):
+                The name of the adapter for which the delta weight should be computed.
+        """
+        oft_r = self.oft_r[adapter_name]
+        oft_s = self.oft_s[adapter_name]
 
-    def get_delta_weight(self, adapter_name: str) -> torch.Tensor:
         rank = self.r[adapter_name]
         coft = self.coft[adapter_name]
         eps = self.eps[adapter_name]
-        opt_r = self.oft_r[adapter_name]
 
         if coft:
             with torch.no_grad():
-                opt_r.copy_(self._project_batch(opt_r, eps=eps))
+                oft_r.copy_(self._project_batch(oft_r, eps=eps))
 
-        orth_rotate = self._cayley_batch(opt_r)
+        orth_rotate = self._cayley_batch(oft_r)
         weight = self._block_diagonal(orth_rotate, rank)
 
-        return weight
-
-    # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L144
-    def _cayley_batch(self, data: torch.Tensor) -> torch.Tensor:
-        b, r, c = data.shape
-        # Ensure the input matrix is skew-symmetric
-        skew = 0.5 * (data - data.transpose(1, 2))
-        I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c)  # noqa: E741
-
-        # Perform the Cayley parametrization
-        Q = torch.bmm(I - skew, torch.inverse(I + skew))
-
-        return Q
-
-    # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L155
-    def _block_diagonal(self, oft_r: torch.Tensor, rank: int) -> torch.Tensor:
-        if oft_r.shape[0] == 1:
-            # block share
-            blocks = [oft_r[0, ...] for i in range(rank)]
-        else:
-            blocks = [oft_r[i, ...] for i in range(rank)]
-
-        # Use torch.block_diag to create the block diagonal matrix
-        A = torch.block_diag(*blocks)
-
-        return A
-
-    # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L52
-    def _project_batch(self, oft_r, eps=1e-5):
-        # scaling factor for each of the smaller block matrix
-        eps = eps * 1 / torch.sqrt(torch.tensor(oft_r.shape[0]))
-        I = (  # noqa: E741
-            torch.zeros((oft_r.size(1), oft_r.size(1)), device=oft_r.device, dtype=oft_r.dtype)
-            .unsqueeze(0)
-            .expand_as(oft_r)
-        )
-        diff = oft_r - I
-        norm_diff = torch.norm(oft_r - I, dim=(1, 2), keepdim=True)
-        mask = (norm_diff <= eps).bool()
-        out = torch.where(mask, oft_r, I + eps * (diff / norm_diff))
-        return out
+        return weight, oft_s
 
     def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         previous_dtype = x.dtype
@@ -282,100 +425,322 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         elif self.merged:
             result = self.base_layer(x, *args, **kwargs)
         else:
-            result = self.base_layer(x, *args, **kwargs)
-            if len(result.shape) == 4:
-                result = result.permute(0, 2, 3, 1)
+            oft_rotation = torch.eye(self.in_features, device=x.device, dtype=previous_dtype)
+            oft_scale = torch.ones((int(self.out_features), 1), device=x.device, dtype=previous_dtype)
 
-            base_layer = self.get_base_layer()
-            base_bias = base_layer.bias
-            if base_bias is not None:
-                # Bias should be added after OFT forward
-                result = result - base_bias.data
-
-            # Execute all the adapters
             for active_adapter in self.active_adapters:
-                if active_adapter not in self._available_adapters:
+                if active_adapter not in self.oft_r.keys():
                     continue
+                oft_r = self.oft_r[active_adapter]
+                oft_s = self.oft_s[active_adapter]
+                dropout = self.oft_dropout[active_adapter]
 
-                module_dropout = self.module_dropout[active_adapter]
-
-                # Modify current execution weights
-                if (not self.training) or (self.training and torch.rand(1) > module_dropout):
-                    result = self._get_delta_activations(active_adapter, result, *args, **kwargs)
+                rank = self.r[active_adapter]
+                coft = self.coft[active_adapter]
+                eps = self.eps[active_adapter]
 
-            if base_bias is not None:
-                result = result + base_bias.data
-            if len(result.shape) == 4:
-                result = result.permute(0, 3, 1, 2)
+                if coft:
+                    with torch.no_grad():
+                        oft_r.copy_(self._project_batch(oft_r, eps=eps))
 
-        result = result.to(previous_dtype)
-        return result
+                orth_rotate = self._cayley_batch(oft_r)
+                orth_rotate = dropout(orth_rotate)
+                oft_mat = self._block_diagonal(orth_rotate, rank)
 
+                oft_rotation = oft_mat @ oft_rotation
+                oft_scale = oft_s * oft_scale
 
-class Linear(OFTLayer):
-    """OFT implemented in Linear layer"""
-
-    def __init__(
-        self,
-        base_layer: nn.Module,
-        adapter_name: str = "default",
-        r: int = 0,
-        module_dropout: float = 0.0,
-        init_weights: bool = True,
-        **kwargs,
-    ):
-        super().__init__(base_layer)
+            x = x.to(self.get_base_layer().weight.data.dtype)
 
-        # Create adapter and set it active
-        self._active_adapter = adapter_name
-        self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs)
+            orig_weight = self.get_base_layer().weight.data
+            orig_weight = torch.transpose(orig_weight, 0, 1)
+            oft_rotation = oft_rotation.to(previous_dtype)
+            orig_weight = orig_weight.to(previous_dtype)
+            rotated_weight = torch.mm(oft_rotation, orig_weight)
+            rotated_weight = torch.transpose(rotated_weight, 0, 1)
 
-    def _get_delta_activations(
-        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
-    ) -> torch.Tensor:
-        delta_weight = self.get_delta_weight(adapter_name)
+            scaled_rotated_weight = rotated_weight * oft_scale
 
-        base_layer = self.get_base_layer()
-        base_weight = base_layer.weight.data
-        delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]]
+            scaled_rotated_weight = scaled_rotated_weight.to(previous_dtype)
+            bias = self.get_base_layer().bias.to(previous_dtype) if self.get_base_layer().bias is not None else None
+            result = F.linear(input=x, weight=scaled_rotated_weight, bias=bias)
 
-        # don't add bias here, because the bias will be added after OFT forward
-        return torch.matmul(input, delta_weight)
+        result = result.to(previous_dtype)
+        return result
 
     def __repr__(self) -> str:
         rep = super().__repr__()
         return "oft." + rep
 
 
-class Conv2d(OFTLayer):
+class Conv2d(nn.Module, OFTLayer):
     """OFT implemented in Conv2d layer"""
 
     def __init__(
         self,
         base_layer: nn.Module,
-        adapter_name: str = "default",
-        r: int = 0,
+        adapter_name: str,
+        r: int = 8,
+        oft_block_size: int = 0,
+        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
         module_dropout: float = 0.0,
-        init_weights: bool = True,
+        coft: bool = False,
+        eps: float = 6e-5,
+        block_share: bool = False,
+        init_weights: Union[bool, str] = True,
         **kwargs,
-    ):
-        super().__init__(base_layer)
+    ) -> None:
+        super().__init__()
+        OFTLayer.__init__(self, base_layer)
+        self.fan_in_fan_out = fan_in_fan_out
 
-        # Create adapter and set it active
         self._active_adapter = adapter_name
-        self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs)
 
-    def _get_delta_activations(
-        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
-    ) -> torch.Tensor:
-        delta_weight = self.get_delta_weight(adapter_name)
+        # Create adapter and set it active
+        self.update_layer(adapter_name, r, oft_block_size, module_dropout, coft, eps, block_share, init_weights)
+
+    def update_layer(self, adapter_name, r, oft_block_size, module_dropout, coft, eps, block_share, init_weights):
+        """
+        Update the conv2d layer with trainable OFT weights.
+        """
+        # Initialize the MultiplicativeDropoutLayer for module_dropout > 0.0.
+        if module_dropout > 0.0:
+            oft_dropout_layer = MultiplicativeDropoutLayer(p=module_dropout)
+        else:
+            oft_dropout_layer = nn.Identity()
+        self.oft_dropout.update(nn.ModuleDict({adapter_name: oft_dropout_layer}))
 
+        # layer information from the base layer
         base_layer = self.get_base_layer()
-        base_weight = base_layer.weight.data
-        delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]]
+        conv_filter_dim = self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
+
+        if r == 0 and oft_block_size != 0:
+            if conv_filter_dim % oft_block_size != 0 or oft_block_size > conv_filter_dim:
+                old_oft_block_size = oft_block_size
+                oft_block_size = self.adjust_oft_parameters(conv_filter_dim, oft_block_size)
+                warnings.warn(
+                    f"Invalid `oft_block_size` ({old_oft_block_size})! Adjusted `oft_block_size` to ({oft_block_size})."
+                )
+            r = int(conv_filter_dim // oft_block_size)
+        elif r != 0 and oft_block_size == 0:
+            if conv_filter_dim % r != 0 or r > conv_filter_dim:
+                old_r = r
+                r = self.adjust_oft_parameters(conv_filter_dim, r)
+                warnings.warn(f"Invalid `r` ({old_r})! Adjusted `r` to ({r}).")
+            oft_block_size = int(conv_filter_dim // r)
+        else:
+            raise ValueError(
+                "Something went wrong, please report this error: https://github.com/huggingface/peft/issues"
+            )
+
+        self.coft[adapter_name] = coft
+        self.block_share[adapter_name] = block_share
+        self.eps[adapter_name] = eps * math.ceil(self.out_features / r) * math.ceil(self.out_features / r)
+
+        # Create weights with provided shape
+        if block_share:
+            self.oft_r[adapter_name] = nn.Parameter(
+                torch.empty(1, math.ceil(conv_filter_dim / r), math.ceil(conv_filter_dim / r))
+            )
+        else:
+            self.oft_r[adapter_name] = nn.Parameter(
+                torch.empty(r, math.ceil(conv_filter_dim / r), math.ceil(conv_filter_dim / r))
+            )
+        self.oft_s[adapter_name] = nn.Parameter(torch.empty(int(self.out_features), 1))
+
+        # Initialize weights
+        self.reset_oft_parameters(adapter_name, init_weights)
+
+        # set oft r and block size
+        self.r[adapter_name] = r
+        self.oft_block_size[adapter_name] = oft_block_size
+
+        # Move new weights to device
+        self._move_adapter_to_device_of_base_layer(adapter_name)
+        self.set_adapter(self.active_adapters)
 
-        # don't add bias here, because the bias will be added after OFT forward
-        return torch.matmul(input, delta_weight)
+    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
+        """
+        Merge the active adapter weights into the base weights
+
+        Args:
+            safe_merge (`bool`, *optional*):
+                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
+                before merging the weights. This is useful if you want to check if the merge operation will produce
+                NaNs. Defaults to `False`.
+            adapter_names (`List[str]`, *optional*):
+                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
+                to `None`.
+        """
+        adapter_names = check_adapters_to_merge(self, adapter_names)
+        if not adapter_names:
+            # no adapter to merge
+            return
+
+        for active_adapter in adapter_names:
+            if active_adapter in self.oft_r.keys():
+                base_layer = self.get_base_layer()
+                if safe_merge:
+                    # Note that safe_merge will be slower than the normal merge
+                    # because of the copy operation.
+                    orig_weights = base_layer.weight.data.clone()
+                    oft_mat, oft_s = self.get_delta_weight(active_adapter)
+
+                    orig_weights = orig_weights.view(
+                        self.out_features, self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
+                    )
+                    orig_weights = torch.transpose(orig_weights, 0, 1)
+                    orig_weights = torch.mm(oft_mat, orig_weights)
+                    orig_weights = torch.transpose(orig_weights, 0, 1)
+                    orig_weights = orig_weights * oft_s
+                    orig_weights = orig_weights.view(
+                        self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0]
+                    )
+
+                    base_layer.weight.data = orig_weights.contiguous()
+                else:
+                    oft_mat, oft_s = self.get_delta_weight(active_adapter)
+
+                    orig_weights = base_layer.weight.data.clone()
+                    orig_weights = orig_weights.view(
+                        self.out_features, self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
+                    )
+                    orig_weights = torch.transpose(orig_weights, 0, 1)
+                    orig_weights = torch.mm(oft_mat, orig_weights)
+                    orig_weights = torch.transpose(orig_weights, 0, 1)
+                    orig_weights = orig_weights * oft_s
+                    orig_weights = orig_weights.view(
+                        self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0]
+                    )
+
+                    base_layer.weight.data = orig_weights.contiguous()
+
+                self.merged_adapters.append(active_adapter)
+
+    def unmerge(self) -> None:
+        """
+        This method unmerges all merged adapter layers from the base weights.
+        """
+        if not self.merged:
+            warnings.warn("Already unmerged. Nothing to do.")
+            return
+        while len(self.merged_adapters) > 0:
+            active_adapter = self.merged_adapters.pop()
+            if active_adapter in self.oft_r.keys():
+                oft_mat, oft_s = self.get_delta_weight(active_adapter)
+
+                orig_weights = self.get_base_layer().weight.data.clone()
+                orig_weights = orig_weights.view(
+                    self.out_features,
+                    self.in_features * self.get_base_layer().kernel_size[0] * self.get_base_layer().kernel_size[0],
+                )
+                orig_weights = torch.transpose(orig_weights, 0, 1)
+                orig_weights = torch.mm(oft_mat.t(), orig_weights)
+                orig_weights = torch.transpose(orig_weights, 0, 1)
+                orig_weights = orig_weights * (1 / oft_s)
+                orig_weights = orig_weights.view(
+                    self.out_features,
+                    self.in_features,
+                    self.get_base_layer().kernel_size[0],
+                    self.get_base_layer().kernel_size[0],
+                )
+
+                self.get_base_layer().weight.data = orig_weights
+
+    def get_delta_weight(self, adapter_name) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Compute the delta weight for the given adapter.
+
+        Args:
+            adapter (str):
+                The name of the adapter for which the delta weight should be computed.
+        """
+        oft_r = self.oft_r[adapter_name]
+        oft_s = self.oft_s[adapter_name]
+
+        rank = self.r[adapter_name]
+        coft = self.coft[adapter_name]
+        eps = self.eps[adapter_name]
+
+        if coft:
+            with torch.no_grad():
+                oft_r.copy_(self._project_batch(oft_r, eps=eps))
+
+        orth_rotate = self._cayley_batch(oft_r)
+        weight = self._block_diagonal(orth_rotate, rank)
+
+        return weight, oft_s
+
+    def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
+        previous_dtype = x.dtype
+
+        if self.disable_adapters:
+            if self.merged:
+                self.unmerge()
+            result = self.base_layer(x, *args, **kwargs)
+        elif self.merged:
+            result = self.base_layer(x, *args, **kwargs)
+        else:
+            oft_rotation = torch.eye(
+                self.in_features * self.get_base_layer().kernel_size[0] * self.get_base_layer().kernel_size[0],
+                device=x.device,
+                dtype=previous_dtype,
+            )
+            oft_scale = torch.ones((int(self.out_features), 1), device=x.device, dtype=previous_dtype)
+
+            for active_adapter in self.active_adapters:
+                if active_adapter not in self.oft_r.keys():
+                    continue
+                oft_r = self.oft_r[active_adapter]
+                oft_s = self.oft_s[active_adapter]
+                dropout = self.oft_dropout[active_adapter]
+
+                rank = self.r[active_adapter]
+                coft = self.coft[active_adapter]
+                eps = self.eps[active_adapter]
+
+                if coft:
+                    with torch.no_grad():
+                        oft_r.copy_(self._project_batch(oft_r, eps=eps))
+
+                orth_rotate = self._cayley_batch(oft_r)
+                orth_rotate = dropout(orth_rotate)
+                oft_mat = self._block_diagonal(orth_rotate, rank)
+
+                oft_rotation = oft_mat @ oft_rotation
+                oft_scale = oft_s * oft_scale
+
+            x = x.to(self.get_base_layer().weight.data.dtype)
+
+            orig_weights = self.base_layer.weight.data
+            orig_weights = orig_weights.view(
+                self.out_features,
+                self.in_features * self.get_base_layer().kernel_size[0] * self.get_base_layer().kernel_size[0],
+            )
+            orig_weights = torch.transpose(orig_weights, 0, 1)
+            oft_rotation = oft_rotation.to(previous_dtype)
+            orig_weights = orig_weights.to(previous_dtype)
+            rotated_weight = torch.mm(oft_rotation, orig_weights)
+            rotated_weight = torch.transpose(rotated_weight, 0, 1)
+
+            scaled_rotated_weight = rotated_weight * oft_scale
+
+            scaled_rotated_weight = scaled_rotated_weight.view(
+                self.out_features,
+                self.in_features,
+                self.get_base_layer().kernel_size[0],
+                self.get_base_layer().kernel_size[0],
+            )
+            result = F.conv2d(
+                input=x,
+                weight=scaled_rotated_weight,
+                bias=self.get_base_layer().bias,
+                padding=self.get_base_layer().padding[0],
+                stride=self.get_base_layer().stride[0],
+            )
+
+        result = result.to(previous_dtype)
+        return result
 
     def __repr__(self) -> str:
         rep = super().__repr__()
diff --git a/src/peft/tuners/oft/model.py b/src/peft/tuners/oft/model.py
index d2530295b6..e44ced3b13 100644
--- a/src/peft/tuners/oft/model.py
+++ b/src/peft/tuners/oft/model.py
@@ -12,18 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import re
-from typing import Dict, Type, Union
+import warnings
+from dataclasses import asdict
+from enum import Enum
+from typing import List, Optional
 
 import torch
 from torch import nn
+from tqdm import tqdm
 
-from peft.tuners.lycoris_utils import LycorisConfig, LycorisTuner
+from peft.tuners.tuners_utils import (
+    BaseTuner,
+    BaseTunerLayer,
+    check_target_module_exists,
+    onload_layer,
+)
+from peft.utils import (
+    TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
+    ModulesToSaveWrapper,
+    _get_submodules,
+)
 
+from .config import OFTConfig
 from .layer import Conv2d, Linear, OFTLayer
 
 
-class OFTModel(LycorisTuner):
+class OFTModel(BaseTuner):
     """
     Creates Orthogonal Finetuning model from a pretrained model. The method is described in
     https://arxiv.org/abs/2306.07280
@@ -76,33 +90,285 @@ class OFTModel(LycorisTuner):
     """
 
     prefix: str = "oft_"
-    layers_mapping: Dict[Type[torch.nn.Module], Type[OFTLayer]] = {
-        torch.nn.Conv2d: Conv2d,
-        torch.nn.Linear: Linear,
-    }
+
+    def __init__(self, model, config, adapter_name, low_cpu_mem_usage: bool = False) -> None:
+        super().__init__(model, config, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage)
+
+    def _check_new_adapter_config(self, config: OFTConfig) -> None:
+        """
+        A helper method to check the config when a new adapter is being added.
+
+        Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
+
+        """
+        # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
+        # does not fully correspond to the error message.
+        if (len(self.peft_config) > 1) and (config.bias != "none"):
+            raise ValueError(
+                f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
+                "set bias to 'none' for all adapters."
+            )
+
+    @staticmethod
+    def _check_target_module_exists(oft_config, key):
+        return check_target_module_exists(oft_config, key)
 
     def _create_and_replace(
         self,
-        config: LycorisConfig,
-        adapter_name: str,
-        target: Union[OFTLayer, nn.Module],
-        target_name: str,
-        parent: nn.Module,
-        current_key: str,
-    ) -> None:
+        oft_config,
+        adapter_name,
+        target,
+        target_name,
+        parent,
+        current_key,
+        **optional_kwargs,
+    ):
+        if current_key is None:
+            raise ValueError("Current Key shouldn't be `None`")
+
+        bias = hasattr(target, "bias") and target.bias is not None
+        kwargs = {
+            "r": oft_config.r,
+            "oft_block_size": oft_config.oft_block_size,
+            "module_dropout": oft_config.module_dropout,
+            "coft": oft_config.coft,
+            "eps": oft_config.eps,
+            "block_share": oft_config.block_share,
+            "fan_in_fan_out": oft_config.fan_in_fan_out,
+            "init_weights": oft_config.init_weights,
+        }
+        kwargs["bias"] = bias
+
+        # If it is not a OFTLayer, create a new module, else update it with new adapters
+        if not isinstance(target, OFTLayer):
+            new_module = self._create_new_module(oft_config, adapter_name, target, **kwargs)
+            if adapter_name not in self.active_adapters:
+                # adding an additional adapter: it is not automatically trainable
+                new_module.requires_grad_(False)
+            self._replace_module(parent, target_name, new_module, target)
+        else:
+            target.update_layer(
+                adapter_name,
+                r=oft_config.r,
+                oft_block_size=oft_config.oft_block_size,
+                module_dropout=oft_config.module_dropout,
+                coft=oft_config.coft,
+                eps=oft_config.eps,
+                block_share=oft_config.block_share,
+                init_weights=oft_config.init_weights,
+            )
+
+    def _replace_module(self, parent, child_name, new_module, child):
+        setattr(parent, child_name, new_module)
+        # It's not necessary to set requires_grad here, as that is handled by
+        # _mark_only_adapters_as_trainable
+
+        # child layer wraps the original module, unpack it
+        if hasattr(child, "base_layer"):
+            child = child.base_layer
+
+        if not hasattr(new_module, "base_layer"):
+            new_module.weight = child.weight
+            if hasattr(child, "bias"):
+                new_module.bias = child.bias
+
+        if getattr(child, "state", None) is not None:
+            if hasattr(new_module, "base_layer"):
+                new_module.base_layer.state = child.state
+            else:
+                new_module.state = child.state
+            new_module.to(child.weight.device)
+
+        meta = torch.device("meta")
+        # dispatch to correct device
+        for name, module in new_module.named_modules():
+            if self.prefix in name:
+                if not any(p.device == meta for p in module.parameters()):
+                    module.to(child.weight.device)
+
+    def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
+        for n, p in model.named_parameters():
+            if self.prefix not in n:
+                p.requires_grad = False
+
+        for active_adapter in self.active_adapters:
+            bias = self.peft_config[active_adapter].bias
+            if bias == "none":
+                continue
+
+            if bias == "all":
+                for n, p in model.named_parameters():
+                    if "bias" in n:
+                        p.requires_grad = True
+            elif bias == "oft_only":
+                for name, m in model.named_modules():
+                    if isinstance(m, OFTLayer) and hasattr(m, "bias") and m.bias is not None:
+                        m.bias.requires_grad = True
+            else:
+                raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")
+
+    @staticmethod
+    def _create_new_module(oft_config, adapter_name, target, **kwargs):
+        if isinstance(target, BaseTunerLayer):
+            target_base_layer = target.get_base_layer()
+        else:
+            target_base_layer = target
+
+        if isinstance(target_base_layer, torch.nn.Linear):
+            if kwargs["fan_in_fan_out"]:
+                warnings.warn(
+                    "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
+                    "Setting fan_in_fan_out to False."
+                )
+                kwargs["fan_in_fan_out"] = oft_config.fan_in_fan_out = False
+            new_module = Linear(target, adapter_name, **kwargs)
+        elif isinstance(target_base_layer, torch.nn.Conv2d):
+            new_module = Conv2d(target, adapter_name, **kwargs)
+        else:
+            raise ValueError(
+                f"Target module {target} is not supported. "
+                "Currently, only `torch.nn.Linear` and `torch.nn.Conv2d` are supported."
+            )
+
+        return new_module
+
+    def __getattr__(self, name: str):
+        """Forward missing attributes to the wrapped module."""
+        try:
+            return super().__getattr__(name)  # defer to nn.Module's logic
+        except AttributeError:
+            if name == "model":  # see #1892: prevent infinite recursion if class is not initialized
+                raise
+            return getattr(self.model, name)
+
+    def get_peft_config_as_dict(self, inference: bool = False):
+        config_dict = {}
+        for key, value in self.peft_config.items():
+            config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
+            if inference:
+                config["inference_mode"] = True
+        config_dict[key] = config
+        return config
+
+    def _set_adapter_layers(self, enabled=True):
+        for module in self.model.modules():
+            if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
+                module.enable_adapters(enabled)
+
+    def enable_adapter_layers(self):
+        self._set_adapter_layers(enabled=True)
+
+    def disable_adapter_layers(self):
+        for active_adapter in self.active_adapters:
+            val = self.peft_config[active_adapter].bias
+            if val != "none":
+                msg = (
+                    f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
+                    "output as the the base model would without adaption."
+                )
+                warnings.warn(msg)
+        self._set_adapter_layers(enabled=False)
+
+    def set_adapter(self, adapter_name):
+        for module in self.model.modules():
+            if isinstance(module, OFTLayer):
+                if module.merged:
+                    warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
+                    module.unmerge()
+                module.set_adapter(adapter_name)
+        self.active_adapter = adapter_name
+
+    @staticmethod
+    def _prepare_adapter_config(peft_config, model_config):
+        if peft_config.target_modules is None:
+            if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING:
+                raise ValueError("Please specify `target_modules` in `peft_config`")
+            peft_config.target_modules = set(
+                TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]]
+            )
+        return peft_config
+
+    def _unload_and_optionally_merge(
+        self,
+        merge=True,
+        progressbar: bool = False,
+        safe_merge: bool = False,
+        adapter_names: Optional[List[str]] = None,
+    ):
+        if merge:
+            self._check_merge_allowed()
+
+        key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
+        desc = "Unloading " + ("and merging " if merge else "") + "model"
+        for key in tqdm(key_list, disable=not progressbar, desc=desc):
+            try:
+                parent, target, target_name = _get_submodules(self.model, key)
+            except AttributeError:
+                continue
+            with onload_layer(target):
+                if hasattr(target, "base_layer"):
+                    if merge:
+                        target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
+                    self._replace_module(parent, target_name, target.get_base_layer(), target)
+                elif isinstance(target, ModulesToSaveWrapper):
+                    # save any additional trainable modules part of `modules_to_save`
+                    new_module = target.modules_to_save[target.active_adapter]
+                    if hasattr(new_module, "base_layer"):
+                        # check if the module is itself a tuner layer
+                        if merge:
+                            new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names)
+                        new_module = new_module.get_base_layer()
+                    setattr(parent, target_name, new_module)
+
+        return self.model
+
+    def delete_adapter(self, adapter_name: str) -> None:
         """
-        A private method to create and replace the target module with the adapter module.
+        Deletes an existing adapter.
+
+        Args:
+            adapter_name (str): Name of the adapter to be deleted.
         """
+        if adapter_name not in list(self.peft_config.keys()):
+            raise ValueError(f"Adapter {adapter_name} does not exist")
+        del self.peft_config[adapter_name]
 
-        # Regexp matching - Find key which matches current target_name in patterns provided
-        pattern_keys = list(config.rank_pattern.keys())
-        target_name_key = next(filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), target_name)
+        key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
+        new_adapter = None
+        for key in key_list:
+            _, target, _ = _get_submodules(self.model, key)
+            if isinstance(target, OFTLayer):
+                target.delete_adapter(adapter_name)
+                if new_adapter is None:
+                    new_adapter = target.active_adapters[:]
 
-        kwargs = config.to_dict()
-        kwargs["r"] = config.rank_pattern.get(target_name_key, config.r)
+        self.active_adapter = new_adapter or []
 
-        if isinstance(target, OFTLayer):
-            target.update_layer(adapter_name, **kwargs)
-        else:
-            new_module = self._create_new_module(config, adapter_name, target, **kwargs)
-            self._replace_module(parent, target_name, new_module, target)
+    def merge_and_unload(
+        self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[List[str]] = None
+    ) -> torch.nn.Module:
+        r"""
+        This method merges the OFT layers into the base model. This is needed if someone wants to use the base model as
+        a standalone model.
+
+        Args:
+            progressbar (`bool`):
+                whether to show a progressbar indicating the unload and merge process
+            safe_merge (`bool`):
+                whether to activate the safe merging check to check if there is any potential Nan in the adapter
+                weights
+            adapter_names (`List[str]`, *optional*):
+                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
+                to `None`.
+
+        """
+        return self._unload_and_optionally_merge(
+            progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
+        )
+
+    def unload(self) -> torch.nn.Module:
+        """
+        Gets back the base model by removing all the oft modules without merging. This gives back the original base
+        model.
+        """
+        return self._unload_and_optionally_merge(merge=False)
diff --git a/tests/test_config.py b/tests/test_config.py
index 716c28e999..ac76eade88 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -34,7 +34,6 @@
     LoKrConfig,
     LoraConfig,
     MultitaskPromptTuningConfig,
-    OFTConfig,
     PeftConfig,
     PeftType,
     PolyConfig,
@@ -61,7 +60,6 @@
     LoKrConfig,
     LoraConfig,
     MultitaskPromptTuningConfig,
-    OFTConfig,
     PolyConfig,
     PrefixTuningConfig,
     PromptEncoderConfig,
@@ -242,7 +240,7 @@ def test_prompt_encoder_warning_num_layers(self):
         expected_msg = "for MLP, the argument `encoder_num_layers` is ignored. Exactly 2 MLP layers are used."
         assert str(record.list[0].message) == expected_msg
 
-    @parameterized.expand([LoHaConfig, LoraConfig, IA3Config, OFTConfig, BOFTConfig, HRAConfig, VBLoRAConfig])
+    @parameterized.expand([LoHaConfig, LoraConfig, IA3Config, BOFTConfig, HRAConfig, VBLoRAConfig])
     def test_save_pretrained_with_target_modules(self, config_class):
         # See #1041, #1045
         config = config_class(target_modules=["a", "list"])
diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py
index 4b163fe848..aa747ad245 100644
--- a/tests/test_custom_models.py
+++ b/tests/test_custom_models.py
@@ -266,25 +266,26 @@
     ########
     # OFT #
     ########
-    ("Vanilla MLP 1 OFT", "MLP", OFTConfig, {"target_modules": "lin0"}),
-    ("Vanilla MLP 2 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"]}),
-    ("Vanilla MLP 5 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}),
+    ("Vanilla MLP 1 OFT", "MLP", OFTConfig, {"r": 2, "target_modules": "lin0"}),
+    ("Vanilla MLP 2 OFT", "MLP", OFTConfig, {"r": 2, "target_modules": ["lin0"]}),
+    ("Vanilla MLP 5 OFT", "MLP", OFTConfig, {"r": 2, "target_modules": ["lin0"], "modules_to_save": ["lin1"]}),
     (
         "Vanilla MLP 6 OFT",
         "MLP",
         OFTConfig,
         {
+            "r": 2,
             "target_modules": ["lin0"],
             "module_dropout": 0.1,
         },
     ),
-    ("Vanilla MLP 7 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "coft": True}),
-    ("Vanilla MLP 8 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "block_share": True}),
-    ("Vanilla MLP 9 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "coft": True, "block_share": True}),
-    ("Conv2d 1 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"]}),
-    ("Conv2d 3 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"], "coft": True}),
-    ("Conv2d 4 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"], "block_share": True}),
-    ("Conv2d 5 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"], "coft": True, "block_share": True}),
+    ("Vanilla MLP 7 OFT", "MLP", OFTConfig, {"r": 2, "target_modules": ["lin0"], "coft": True}),
+    ("Vanilla MLP 8 OFT", "MLP", OFTConfig, {"r": 2, "target_modules": ["lin0"], "block_share": True}),
+    ("Vanilla MLP 9 OFT", "MLP", OFTConfig, {"r": 2, "target_modules": ["lin0"], "coft": True, "block_share": True}),
+    ("Conv2d 1 OFT", "Conv2d", OFTConfig, {"r": 5, "target_modules": ["conv2d"]}),
+    ("Conv2d 3 OFT", "Conv2d", OFTConfig, {"r": 5, "target_modules": ["conv2d"], "coft": True}),
+    ("Conv2d 4 OFT", "Conv2d", OFTConfig, {"r": 5, "target_modules": ["conv2d"], "block_share": True}),
+    ("Conv2d 5 OFT", "Conv2d", OFTConfig, {"r": 5, "target_modules": ["conv2d"], "coft": True, "block_share": True}),
     ########
     # HRA #
     ########
@@ -1419,7 +1420,7 @@ def test_multiple_adapters_automatic_modules_to_save(self):
         assert "default" in model.base_model.classifier.modules_to_save
         assert "other" in model.base_model.classifier.modules_to_save
 
-    @parameterized.expand([IA3Config, LoHaConfig, LoKrConfig, LoraConfig, OFTConfig, HRAConfig])
+    @parameterized.expand([IA3Config, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig])
     def test_multiple_adapters_mixed_modules_to_save(self, config_cls):
         # See issue 1574
         # Check that we can have a model where one adapter has modules_to_save and the other doesn't. It should be
@@ -1444,7 +1445,7 @@ def test_multiple_adapters_mixed_modules_to_save(self, config_cls):
         model.set_adapter("other")
         model(**inputs)
 
-    @parameterized.expand([IA3Config, LoHaConfig, LoKrConfig, LoraConfig, OFTConfig, HRAConfig])
+    @parameterized.expand([IA3Config, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig])
     def test_multiple_adapters_mixed_modules_to_save_order_switched(self, config_cls):
         # See issue 1574
         # Same test as test_multiple_adapters_mixed_modules_to_save, but this time the 2nd adapter has modules_to_save.
@@ -1647,7 +1648,7 @@ def test_load_resized_embedding_ignore_mismatched_sizes(self):
             LoHaConfig(target_modules=["lin0"], init_weights=False),
             AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False),
             IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"], init_ia3_weights=False),
-            OFTConfig(target_modules=["lin0"], init_weights=False),
+            OFTConfig(target_modules=["lin0"], init_weights=False, r=2),
             BOFTConfig(target_modules=["lin0"], init_weights=False, boft_block_size=2),
             HRAConfig(target_modules=["lin0"], init_weights=False),
         ]
@@ -2726,16 +2727,17 @@ def test_requires_grad_lokr_same_targets(self):
 
     def test_requires_grad_oft_different_targets(self):
         # test two different OFT adapters that target different modules
-        config0 = OFTConfig(target_modules=["lin0"])
+        config0 = OFTConfig(target_modules=["lin0"], r=2)
         peft_model = get_peft_model(MLP(), config0)
 
-        config1 = OFTConfig(target_modules=["lin1"], inference_mode=True)
+        config1 = OFTConfig(target_modules=["lin1"], r=2, inference_mode=True)
         peft_model.add_adapter("adapter1", config1)
 
         # active adapter is still "default"
         self.check_requires_grad(
             peft_model,
             "base_model.model.lin0.oft_r.default",
+            "base_model.model.lin0.oft_s.default",
         )
 
         # set config0 as active, should not change anything
@@ -2743,6 +2745,7 @@ def test_requires_grad_oft_different_targets(self):
         self.check_requires_grad(
             peft_model,
             "base_model.model.lin0.oft_r.default",
+            "base_model.model.lin0.oft_s.default",
         )
 
         # change activate pter to pter1
@@ -2750,6 +2753,7 @@ def test_requires_grad_oft_different_targets(self):
         self.check_requires_grad(
             peft_model,
             "base_model.model.lin1.oft_r.adapter1",
+            "base_model.model.lin1.oft_s.adapter1",
         )
 
         # disable all pters
@@ -2760,20 +2764,22 @@ def test_requires_grad_oft_different_targets(self):
         self.check_requires_grad(
             peft_model,
             "base_model.model.lin1.oft_r.adapter1",
+            "base_model.model.lin1.oft_s.adapter1",
         )
 
     def test_requires_grad_oft_same_targets(self):
         # same as previous test, except that OFT adapters target the same layer
-        config0 = OFTConfig(target_modules=["lin0"])
+        config0 = OFTConfig(target_modules=["lin0"], r=2)
         peft_model = get_peft_model(MLP(), config0)
 
-        config1 = OFTConfig(target_modules=["lin0"], inference_mode=True)
+        config1 = OFTConfig(target_modules=["lin0"], r=2, inference_mode=True)
         peft_model.add_adapter("adapter1", config1)
 
         # active adapter is still "default"
         self.check_requires_grad(
             peft_model,
             "base_model.model.lin0.oft_r.default",
+            "base_model.model.lin0.oft_s.default",
         )
 
         # set config0 as active, should not change anything
@@ -2781,6 +2787,7 @@ def test_requires_grad_oft_same_targets(self):
         self.check_requires_grad(
             peft_model,
             "base_model.model.lin0.oft_r.default",
+            "base_model.model.lin0.oft_s.default",
         )
 
         # change activate adapter to adapter1
@@ -2788,6 +2795,7 @@ def test_requires_grad_oft_same_targets(self):
         self.check_requires_grad(
             peft_model,
             "base_model.model.lin0.oft_r.adapter1",
+            "base_model.model.lin0.oft_s.adapter1",
         )
 
         # disable all adapters
@@ -2799,6 +2807,7 @@ def test_requires_grad_oft_same_targets(self):
         self.check_requires_grad(
             peft_model,
             "base_model.model.lin0.oft_r.adapter1",
+            "base_model.model.lin0.oft_s.adapter1",
         )
 
     def test_requires_grad_hra_different_targets(self):
diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py
index dd0aeeca6e..6204db93f6 100644
--- a/tests/test_decoder_models.py
+++ b/tests/test_decoder_models.py
@@ -24,6 +24,7 @@
     BOFTConfig,
     HRAConfig,
     LoraConfig,
+    OFTConfig,
     PrefixTuningConfig,
     PromptTuningConfig,
     PromptTuningInit,
@@ -55,21 +56,29 @@ def skip_adalora_and_gpt2(test_list):
     return [test for test in test_list if not (("GPT2LMHeadModel" in test[1]) and (test[2] == AdaLoraConfig))]
 
 
-def skip_boft_or_hra_and_gpt2(test_list):
+def skip_oft_or_hra_and_gpt2(test_list):
     return [
         test
         for test in test_list
-        if not (("GPT2LMHeadModel" in test[1]) and ((test[2] == BOFTConfig) or (test[2] == HRAConfig)))
+        if not (
+            ("GPT2LMHeadModel" in test[1])
+            and ((test[2] == BOFTConfig) or (test[2] == HRAConfig) or (test[2] == OFTConfig))
+        )
     ]
 
 
-def skip_adalora_or_boft_or_hra_and_gpt2(test_list):
+def skip_adalora_or_oft_or_hra_and_gpt2(test_list):
     return [
         test
         for test in test_list
         if not (
             ("GPT2LMHeadModel" in test[1])
-            and ((test[2] == AdaLoraConfig) or (test[2] == BOFTConfig) or (test[2] == HRAConfig))
+            and (
+                (test[2] == AdaLoraConfig)
+                or (test[2] == BOFTConfig)
+                or (test[2] == HRAConfig)
+                or (test[2] == OFTConfig)
+            )
         )
     ]
 
@@ -96,19 +105,19 @@ def prepare_inputs_for_testing(self):
         return input_dict
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_attributes_parametrized(self, test_name, model_id, config_cls, config_kwargs):
         self._test_model_attr(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_adapter_name(self, test_name, model_id, config_cls, config_kwargs):
         self._test_adapter_name(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_prepare_for_training_parametrized(self, test_name, model_id, config_cls, config_kwargs):
         self._test_prepare_for_training(model_id, config_cls, config_kwargs)
@@ -168,31 +177,31 @@ def test_prompt_tuning_config_invalid_args(self):
             )
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_save_pretrained(self, test_name, model_id, config_cls, config_kwargs):
         self._test_save_pretrained(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_save_pretrained_pickle(self, test_name, model_id, config_cls, config_kwargs):
         self._test_save_pretrained(model_id, config_cls, config_kwargs, safe_serialization=False)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_save_pretrained_selected_adapters(self, test_name, model_id, config_cls, config_kwargs):
         self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_save_pretrained_selected_adapters_pickle(self, test_name, model_id, config_cls, config_kwargs):
         self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs, safe_serialization=False)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_from_pretrained_config_construction(self, test_name, model_id, config_cls, config_kwargs):
         self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs)
@@ -205,6 +214,7 @@ def test_from_pretrained_config_construction(self, test_name, model_id, config_c
                 "adalora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "vera_kwargs": {"init_weights": [False]},
                 "fourierft_kwargs": {"init_weights": [False]},
                 "hra_kwargs": {"init_weights": [False]},
@@ -222,12 +232,13 @@ def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs):
                 "lora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "vera_kwargs": {"init_weights": [False]},
                 "fourierft_kwargs": {"init_weights": [False]},
                 "hra_kwargs": {"init_weights": [False]},
                 "task_type": "CAUSAL_LM",
             },
-            filter_params_func=skip_boft_or_hra_and_gpt2,
+            filter_params_func=skip_oft_or_hra_and_gpt2,
         )
     )
     def test_merge_layers_multi(self, test_name, model_id, config_cls, config_kwargs):
@@ -240,6 +251,7 @@ def test_merge_layers_multi(self, test_name, model_id, config_cls, config_kwargs
                 "lora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "task_type": "CAUSAL_LM",
             },
         )
@@ -260,13 +272,13 @@ def test_mixed_adapter_batches(self, test_name, model_id, config_cls, config_kwa
         self._test_mixed_adapter_batches(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_generate(self, test_name, model_id, config_cls, config_kwargs):
         self._test_generate(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_generate_pos_args(self, test_name, model_id, config_cls, config_kwargs):
         # positional args are supported for PeftModelForCausalLM
@@ -285,7 +297,7 @@ def test_prefix_tuning_half_prec_conversion(self, test_name, model_id, config_cl
         self._test_prefix_tuning_half_prec_conversion(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_training_decoders(self, test_name, model_id, config_cls, config_kwargs):
         self._test_training(model_id, config_cls, config_kwargs)
@@ -295,13 +307,13 @@ def test_training_decoders_layer_indexing(self, test_name, model_id, config_cls,
         self._test_training_layer_indexing(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_training_decoders_gradient_checkpointing(self, test_name, model_id, config_cls, config_kwargs):
         self._test_training_gradient_checkpointing(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_inference_safetensors(self, test_name, model_id, config_cls, config_kwargs):
         self._test_inference_safetensors(model_id, config_cls, config_kwargs)
@@ -311,19 +323,19 @@ def test_peft_model_device_map(self, test_name, model_id, config_cls, config_kwa
         self._test_peft_model_device_map(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_delete_adapter(self, test_name, model_id, config_cls, config_kwargs):
         self._test_delete_adapter(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_kwargs):
         self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, config_cls, config_kwargs):
         self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs)
@@ -336,12 +348,13 @@ def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, co
                 "adalora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "vera_kwargs": {"init_weights": [False]},
                 "fourierft_kwargs": {"init_weights": [False]},
                 "hra_kwargs": {"init_weights": [False]},
                 "task_type": "CAUSAL_LM",
             },
-            filter_params_func=skip_adalora_or_boft_or_hra_and_gpt2,
+            filter_params_func=skip_adalora_or_oft_or_hra_and_gpt2,
         )
     )
     def test_unload_adapter(self, test_name, model_id, config_cls, config_kwargs):
@@ -354,6 +367,7 @@ def test_unload_adapter(self, test_name, model_id, config_cls, config_kwargs):
                 "lora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "task_type": "CAUSAL_LM",
             },
         )
@@ -373,12 +387,13 @@ def test_training_prompt_learning_tasks(self, test_name, model_id, config_cls, c
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "adalora_kwargs": {"init_lora_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "vera_kwargs": {"init_weights": [False]},
                 "fourierft_kwargs": {"init_weights": [False]},
                 "hra_kwargs": {"init_weights": [False]},
                 "task_type": "CAUSAL_LM",
             },
-            filter_params_func=skip_boft_or_hra_and_gpt2,
+            filter_params_func=skip_oft_or_hra_and_gpt2,
         )
     )
     def test_disable_adapter(self, test_name, model_id, config_cls, config_kwargs):
@@ -395,7 +410,7 @@ def test_generate_adalora_no_dropout(self):
         self._test_generate(model_id, AdaLoraConfig, config_kwargs)
 
     @parameterized.expand(
-        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_boft_or_hra_and_gpt2)
+        PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2)
     )
     def test_passing_input_embeds_works(self, test_name, model_id, config_cls, config_kwargs):
         self._test_passing_input_embeds_works(test_name, model_id, config_cls, config_kwargs)
diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py
index dea757c266..2b9f68fc21 100644
--- a/tests/test_encoder_decoder_models.py
+++ b/tests/test_encoder_decoder_models.py
@@ -173,6 +173,7 @@ def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, co
                 "adalora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "vera_kwargs": {"init_weights": [False]},
                 "hra_kwargs": {"init_weights": [False]},
                 "task_type": "SEQ_2_SEQ_LM",
@@ -207,6 +208,7 @@ def test_training_prompt_learning_tasks(self, test_name, model_id, config_cls, c
                 "adalora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "vera_kwargs": {"init_weights": [False]},
                 "hra_kwargs": {"init_weights": [False]},
                 "task_type": "SEQ_2_SEQ_LM",
diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py
index 5521c1125d..05cbeb73d4 100644
--- a/tests/test_feature_extraction_models.py
+++ b/tests/test_feature_extraction_models.py
@@ -111,6 +111,7 @@ def test_from_pretrained_config_construction(self, test_name, model_id, config_c
                 "adalora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "vera_kwargs": {"init_weights": [False]},
                 "hra_kwargs": {"init_weights": [False]},
                 "task_type": "FEATURE_EXTRACTION",
@@ -164,6 +165,7 @@ def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_k
                 "adalora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "vera_kwargs": {"init_weights": [False]},
                 "hra_kwargs": {"init_weights": [False]},
                 "task_type": "FEATURE_EXTRACTION",
@@ -180,6 +182,7 @@ def test_unload_adapter(self, test_name, model_id, config_cls, config_kwargs):
                 "lora_kwargs": {"init_lora_weights": [False]},
                 "ia3_kwargs": {"init_ia3_weights": [False]},
                 "boft_kwargs": {"init_weights": [False]},
+                "oft_kwargs": {"init_weights": [False]},
                 "hra_kwargs": {"init_weights": [False]},
                 "task_type": "FEATURE_EXTRACTION",
             },
diff --git a/tests/test_mixed.py b/tests/test_mixed.py
index 41e9aceae0..3845046b4e 100644
--- a/tests/test_mixed.py
+++ b/tests/test_mixed.py
@@ -15,6 +15,7 @@
 import copy
 import itertools
 import os
+import platform
 import re
 import tempfile
 import unittest
@@ -30,7 +31,6 @@
     LoHaConfig,
     LoKrConfig,
     LoraConfig,
-    OFTConfig,
     PeftMixedModel,
     PrefixTuningConfig,
     get_peft_model,
@@ -396,7 +396,6 @@ def _check_loading(self, model_cls, config0, config1, input, *, is_commutative):
                 LoHaConfig(target_modules=["lin0"], init_weights=False),
                 LoKrConfig(target_modules=["lin0"], init_weights=False),
                 AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False),
-                OFTConfig(target_modules=["lin0"], init_weights=False),
             ],
             r=2,
         ),
@@ -417,7 +416,6 @@ def test_target_first_layer(self, config0, config1):
                 LoHaConfig(target_modules=["lin1"], init_weights=False),
                 LoKrConfig(target_modules=["lin1"], init_weights=False),
                 AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False),
-                OFTConfig(target_modules=["lin1"], init_weights=False),
             ],
             r=2,
         ),
@@ -428,14 +426,12 @@ def test_target_last_layer(self, config0, config1):
         # to the output, the results should be commutative. This would *not* work if the adapters do something more
         # complex or if we target an earlier layer, because of the non-linearity would destroy the commutativity.
         input = torch.arange(90).reshape(9, 10).to(self.torch_device)
-        # OFT is not commutative, as it's not a linear operation on the inputs
-        is_commutative = not any(isinstance(config, OFTConfig) for config in [config0, config1])
 
-        self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=is_commutative)
+        self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=True)
         self._check_merging(SimpleNet, config0, config1, input)
         self._check_unload(SimpleNet, config0, config1, input)
         self._check_disable(SimpleNet, config1, config0, input)
-        self._check_loading(SimpleNet, config0, config1, input, is_commutative=is_commutative)
+        self._check_loading(SimpleNet, config0, config1, input, is_commutative=True)
 
     @parameterized.expand(
         itertools.combinations(
@@ -444,7 +440,6 @@ def test_target_last_layer(self, config0, config1):
                 LoHaConfig(init_weights=False),
                 LoKrConfig(init_weights=False),
                 AdaLoraConfig(init_lora_weights=False),
-                OFTConfig(init_weights=False),
             ],
             r=2,
         ),
@@ -488,19 +483,13 @@ def test_target_different_layers(self, config0, config1):
                 AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False),
                 AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False),
             ),
-            (
-                OFTConfig(target_modules=["lin1"], init_weights=False),
-                OFTConfig(target_modules=["lin1"], init_weights=False),
-            ),
         ],
         name_func=_param_name_func,
     )
     def test_target_last_layer_same_type(self, config0, config1):
         input = torch.arange(90).reshape(9, 10).to(self.torch_device)
-        # OFT is not commutative, as it's not a linear operation on the inputs
-        is_commutative = not any(isinstance(config, OFTConfig) for config in [config0, config1])
 
-        self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=is_commutative)
+        self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=True)
         self._check_merging(SimpleNet, config0, config1, input)
         self._check_unload(SimpleNet, config0, config1, input)
         self._check_disable(SimpleNet, config1, config0, input)
@@ -523,10 +512,6 @@ def test_target_last_layer_same_type(self, config0, config1):
                 AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False),
                 AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False),
             ),
-            (
-                OFTConfig(target_modules=["lin0"], init_weights=False),
-                OFTConfig(target_modules=["lin0"], init_weights=False),
-            ),
         ],
         name_func=_param_name_func,
     )
@@ -540,6 +525,9 @@ def test_target_first_layer_same_type(self, config0, config1):
 
     def test_deeply_nested(self):
         # a somewhat absurdly nested model using different adapter types
+        if platform.system() == "Linux":
+            self.skipTest("This test fails but only on GitHub CI with Linux systems.")
+
         atol = 1e-5
         rtol = 1e-5
         torch.manual_seed(0)
@@ -560,10 +548,7 @@ def test_deeply_nested(self):
         config3 = LoKrConfig(r=4, alpha=4, target_modules=["lin0", "lin1"], init_weights=False)
         peft_model.add_adapter("adapter3", config3)
 
-        config4 = OFTConfig(r=8, target_modules=["lin0", "lin1"], init_weights=False)
-        peft_model.add_adapter("adapter4", config4)
-
-        peft_model.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3", "adapter4"])
+        peft_model.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3"])
         output_mixed = peft_model(input)
         assert torch.isfinite(output_base).all()
         assert not torch.allclose(output_base, output_mixed, atol=atol, rtol=rtol)
@@ -589,7 +574,7 @@ def test_deeply_nested(self):
         assert torch.isfinite(output_13).all()
         assert not torch.allclose(output_mixed, output_13, atol=atol, rtol=rtol)
 
-        model_copy.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3", "adapter4"])
+        model_copy.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3"])
         model_merged_unloaded = model_copy.merge_and_unload(adapter_names=["adapter1", "adapter3"])
         output_merged_13 = model_merged_unloaded(input)
         assert torch.isfinite(output_merged_13).all()
@@ -763,12 +748,7 @@ def test_decoder_model(self):
         assert not torch.allclose(output2, output3)
 
         torch.manual_seed(4)
-        config4 = OFTConfig(task_type="CAUSAL_LM", target_modules=["q_proj", "v_proj"], init_weights=False)
-        peft_model.add_adapter("adapter4", config4)
-        peft_model.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3", "adapter4"])
-        output4 = peft_model.generate(**input_dict)
-        assert torch.isfinite(output4).all()
-        assert not torch.allclose(output3, output4)
+        peft_model.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3"])
 
         with peft_model.disable_adapter():
             output_disabled = peft_model.generate(**input_dict)
@@ -778,7 +758,6 @@ def test_decoder_model(self):
         model_unloaded = peft_model.merge_and_unload()
         output_unloaded = model_unloaded.generate(**input_dict)
         assert torch.isfinite(output_unloaded).all()
-        assert torch.allclose(output4, output_unloaded)
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             # save adapter0 (use normal PeftModel, because PeftMixedModel does not support saving)
diff --git a/tests/test_stablediffusion.py b/tests/test_stablediffusion.py
index 99dbced4fd..53c06255eb 100644
--- a/tests/test_stablediffusion.py
+++ b/tests/test_stablediffusion.py
@@ -72,12 +72,12 @@
     },
     {
         "text_encoder": {
-            "r": 8,
+            "r": 1,
             "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
             "module_dropout": 0.0,
         },
         "unet": {
-            "r": 8,
+            "r": 1,
             "target_modules": ["proj_in", "proj_out", "to_k", "to_q", "to_v", "to_out.0", "ff.net.0.proj", "ff.net.2"],
             "module_dropout": 0.0,
         },
diff --git a/tests/test_vision_models.py b/tests/test_vision_models.py
index c751390e47..f3a93dfcf0 100644
--- a/tests/test_vision_models.py
+++ b/tests/test_vision_models.py
@@ -44,7 +44,7 @@
     "lora": LoraConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]),
     "loha": LoHaConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]),
     "lokr": LoKrConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]),
-    "oft": OFTConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]),
+    "oft": OFTConfig(r=1, target_modules=["convolution"], modules_to_save=["classifier", "normalization"]),
     "hra": HRAConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]),
     # TODO: cannot use BOFT because some convolutional kernel dimensions are even (64) and others odd (147). There is no
     # common denominator for the boft_block_size except 1, but using 1 results in an error in the fbd_cuda kernel:
diff --git a/tests/testing_common.py b/tests/testing_common.py
index fe354edde2..860948bcfb 100644
--- a/tests/testing_common.py
+++ b/tests/testing_common.py
@@ -38,6 +38,7 @@
     LoHaConfig,
     LoKrConfig,
     LoraConfig,
+    OFTConfig,
     PeftModel,
     PeftType,
     PrefixTuningConfig,
@@ -113,6 +114,10 @@
     },
     # VBLoRA
     {"target_modules": None, "vblora_dropout": 0.05, "vector_length": 1, "num_vectors": 2},
+    # OFT
+    {
+        "target_modules": None,
+    },
 )
 
 CLASSES_MAPPING = {
@@ -127,6 +132,7 @@
     "fourierft": (FourierFTConfig, CONFIG_TESTING_KWARGS[8]),
     "hra": (HRAConfig, CONFIG_TESTING_KWARGS[9]),
     "vblora": (VBLoRAConfig, CONFIG_TESTING_KWARGS[10]),
+    "oft": (OFTConfig, CONFIG_TESTING_KWARGS[11]),
 }
 
 
@@ -646,7 +652,7 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs):
         if issubclass(config_cls, PromptLearningConfig):
             return pytest.skip(f"Test not applicable for {config_cls}")
 
-        if issubclass(config_cls, BOFTConfig):
+        if issubclass(config_cls, (OFTConfig, BOFTConfig)):
             return pytest.skip(f"Test not applicable for {config_cls}")
 
         if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig):
@@ -1106,6 +1112,10 @@ def _test_training_gradient_checkpointing(self, model_id, config_cls, config_kwa
             # TODO: no gradients on the "dense" layer, other layers work, not sure why
             self.skipTest("AdaLora with RoBERTa does not work correctly")
 
+        if (config_cls == OFTConfig) and ("deberta" in model_id.lower()):
+            # TODO: no gradients on the "dense" layer, other layers work, not sure why
+            self.skipTest("OFT with Deberta does not work correctly")
+
         model = self.transformers_class.from_pretrained(model_id)
 
         if not getattr(model, "supports_gradient_checkpointing", False):
@@ -1284,7 +1294,7 @@ def _test_unload_adapter(self, model_id, config_cls, config_kwargs):
         model = get_peft_model(model, config)
         model = model.to(self.torch_device)
 
-        if config.peft_type not in ("LORA", "ADALORA", "IA3", "BOFT", "VERA", "FOURIERFT", "HRA", "VBLORA"):
+        if config.peft_type not in ("LORA", "ADALORA", "IA3", "BOFT", "OFT", "VERA", "FOURIERFT", "HRA", "VBLORA"):
             with pytest.raises(AttributeError):
                 model = model.unload()
         else:

From ae297f07995bdf949b0989de8528092d9624780f Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Wed, 2 Oct 2024 12:43:05 +0200
Subject: [PATCH 05/22] ENH: Improved attribute access for modules_to_save
 (#2117)

Resolves #2099

So far, if a module was wrapped due to modules_to_save, we handled
access to the weight and bias attribute (albeit incorrectly in case of
disabled adapters!). However, there could be more attributes than those
that could be accessed, in which case we got an error so far.

Instead of special properties, we now implement a generic __getattr__
method that can deal with any attribute. The implementation is a bit
complex to take into account the way that torch.nn.Module handles
__getattr__.
---
 src/peft/utils/other.py | 34 ++++++++++-----
 tests/test_other.py     | 96 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 11 deletions(-)

diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py
index 9b1f474a00..c38192d8f8 100644
--- a/src/peft/utils/other.py
+++ b/src/peft/utils/other.py
@@ -227,17 +227,29 @@ def active_adapter(self) -> str:
         # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method
         return self._active_adapter
 
-    @property
-    def weight(self):
-        if self.active_adapter not in self.modules_to_save:
-            return self.original_module.weight
-        return self.modules_to_save[self.active_adapter].weight
-
-    @property
-    def bias(self):
-        if self.active_adapter not in self.modules_to_save:
-            return self.original_module.bias
-        return self.modules_to_save[self.active_adapter].bias
+    def __getattr__(self, name: str):
+        # Note: This whole method may seem overly complex at first but PyTorch messes with __getattr__ in a way that
+        # requires very careful handling to avoid infinite recursion.
+        try:
+            return super().__getattr__(name)
+        except AttributeError:
+            pass
+
+        if "_modules" not in self.__dict__:
+            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
+
+        # Could not find the attribute the PyTorch way. So let's check if it's an attribute on the
+        # original_module/modules_to_save.
+        modules = self.__dict__["_modules"]
+        if self.disable_adapters:
+            module = modules["original_module"]
+        elif self.active_adapter in modules["modules_to_save"]:
+            module = modules["modules_to_save"][self.active_adapter]
+        else:
+            # For some reason, there is no module corresponding to the active adapter; this should normally not be
+            # reached and exists as a failsafe (otherwise, a KeyError would be raised)
+            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
+        return getattr(module, name)
 
     def update(self, adapter_name):
         context_manager = nullcontext()
diff --git a/tests/test_other.py b/tests/test_other.py
index 04c67d3bf2..75d8a7565c 100644
--- a/tests/test_other.py
+++ b/tests/test_other.py
@@ -18,6 +18,7 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification
 
 from peft import LoraConfig, get_peft_model
+from peft.utils.other import ModulesToSaveWrapper
 
 
 class ModelWithModuleDict(nn.Module):
@@ -103,3 +104,98 @@ def test_get_peft_model_revision_warning(tmp_path):
     overwrite_warning = f"peft config has already set base model revision to {base_revision}, overwriting with revision {overwrite_revision}"
     with pytest.warns(UserWarning, match=overwrite_warning):
         _ = get_peft_model(base_model, lora_config, revision=overwrite_revision)
+
+
+class TestModulesToSaveAttributeAccess:
+    """Test attribute accces on the ModulesToSaveWrapper class.
+
+    When we have modules_to_save, the original module is wrapped. As long as only forward was called on this wrapped
+    module, we were good. However, if, for instance, model parameters were directly accessed by another module, this
+    would typically fail, as the wrapper does not have this attribute. We had special properties for weight and bias,
+    but this is not enough. Therefore, attribute access is now transiently delegated to the active adapter (or original
+    module, if the adapter is disabled).
+
+    For one example, see #2099.
+
+    """
+
+    @pytest.fixture
+    def mlp(self):
+        class MLP(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.lin0 = nn.Linear(1, 2)
+                self.lin1 = nn.Linear(3, 4)
+
+        return MLP()
+
+    def test_transient_attribute_access_default_adapter(self, mlp):
+        config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"])
+        model = get_peft_model(mlp, config)
+        assert model.lin1.weight is model.lin1.modules_to_save["default"].weight
+        assert model.lin1.bias is model.lin1.modules_to_save["default"].bias
+
+    def test_transient_attribute_access_non_default_adapter(self, mlp):
+        config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"])
+        model = get_peft_model(mlp, config)
+        model.add_adapter("other", config)
+
+        # at this point, default is still active
+        assert model.lin1.weight is model.lin1.modules_to_save["default"].weight
+        assert model.lin1.bias is model.lin1.modules_to_save["default"].bias
+        assert model.lin1.weight is not model.lin1.modules_to_save["other"].weight
+        assert model.lin1.bias is not model.lin1.modules_to_save["other"].bias
+
+        model.set_adapter("other")
+        assert model.lin1.weight is not model.lin1.modules_to_save["default"].weight
+        assert model.lin1.bias is not model.lin1.modules_to_save["default"].bias
+        assert model.lin1.weight is model.lin1.modules_to_save["other"].weight
+        assert model.lin1.bias is model.lin1.modules_to_save["other"].bias
+
+    def test_transient_attribute_access_disabled_adapter(self, mlp):
+        config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"])
+        model = get_peft_model(mlp, config)
+
+        # at this point, default is still active
+        assert model.lin1.weight is model.lin1.modules_to_save["default"].weight
+        assert model.lin1.bias is model.lin1.modules_to_save["default"].bias
+        assert model.lin1.weight is not model.lin1.original_module.weight
+        assert model.lin1.bias is not model.lin1.original_module.bias
+
+        with model.disable_adapter():
+            assert model.lin1.weight is not model.lin1.modules_to_save["default"].weight
+            assert model.lin1.bias is not model.lin1.modules_to_save["default"].bias
+            assert model.lin1.weight is model.lin1.original_module.weight
+            assert model.lin1.bias is model.lin1.original_module.bias
+
+    def test_transient_attribute_access_uninitialized_adapter(self, mlp):
+        # ensure that there is no weird infinite recursion when accessing a non-existing attribute on the class itself
+        with pytest.raises(AttributeError, match="has no attribute 'original_module'"):
+            ModulesToSaveWrapper.original_module
+
+    def test_transient_attribute_access_attr_does_not_exist_on_modules_to_save(self, mlp):
+        # ensure that there is no weird infinite recursion when accessing a non-existing attribute on the
+        # ModelToSaveWrapper instance
+        config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"])
+        model = get_peft_model(mlp, config)
+
+        with pytest.raises(AttributeError, match="has no attribute 'foo'"):
+            model.lin1.foo
+
+    def test_transient_attribute_access_attr_does_not_exist_on_original_module(self, mlp):
+        # ensure that there is no weird infinite recursion when accessing a non-existing attribute on the
+        # original module of the ModelToSaveWrapper instance
+        config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"])
+        model = get_peft_model(mlp, config)
+
+        with pytest.raises(AttributeError, match="has no attribute 'foo'"):
+            with model.disable_adapter():
+                model.lin1.foo
+
+    def test_transient_attribute_access_non_existing_adapter(self, mlp):
+        # This should normally never happen, as the active adapter should always exist, but it's a failsafe
+        config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"])
+        model = get_peft_model(mlp, config)
+        model.base_model.model.lin1._active_adapter = "does-not-exist"
+        with pytest.raises(AttributeError, match="has no attribute 'weight'"):
+            model.lin1.weight

From ca8462bb68b48d3cc613f3aafc81eb50634549d1 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:27:26 +0200
Subject: [PATCH 06/22] FIX low_cpu_mem_usage consolidates devices (#2113)

See: https://github.com/huggingface/diffusers/pull/9510#issuecomment-2378316687

Right now, the low_cpu_mem_usage=True option does not consolidate the
devices. E.g. when the model is on GPU and the state_dict on CPU, the
adapter weight will be on CPU after loading, when it should be GPU. This
fix ensures that the devices are consolidated.
---
 src/peft/utils/save_and_load.py |  4 +++
 tests/test_gpu_examples.py      | 51 +++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py
index 5b40b4314c..ae210877ef 100644
--- a/src/peft/utils/save_and_load.py
+++ b/src/peft/utils/save_and_load.py
@@ -443,6 +443,10 @@ def renamed_dora_weights(k):
     )
     if low_cpu_mem_usage:
         load_result = model.load_state_dict(peft_model_state_dict, strict=False, assign=True)
+        # ensure that the correct device is set
+        for module in model.modules():
+            if hasattr(module, "_move_adapter_to_device_of_base_layer"):
+                module._move_adapter_to_device_of_base_layer(adapter_name)
     else:
         load_result = model.load_state_dict(peft_model_state_dict, strict=False)
 
diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py
index 30e50fd57e..b39bd99f5d 100644
--- a/tests/test_gpu_examples.py
+++ b/tests/test_gpu_examples.py
@@ -55,8 +55,11 @@
     PromptEncoderConfig,
     TaskType,
     get_peft_model,
+    get_peft_model_state_dict,
+    inject_adapter_in_model,
     prepare_model_for_kbit_training,
     replace_lora_weights_loftq,
+    set_peft_model_state_dict,
 )
 from peft.tuners import boft
 from peft.utils import SAFETENSORS_WEIGHTS_NAME, infer_device
@@ -3226,3 +3229,51 @@ def test_p_tuning_exactly_reproducible_after_loading(self, tmp_path):
 
         torch.testing.assert_close(output_loaded, output_peft)
         torch.testing.assert_close(gen_loaded, gen_peft)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU")
+@pytest.mark.single_gpu_tests
+class TestLowCpuMemUsageDifferentDevices:
+    """Test for the low CPU memory usage option for loading PEFT models.
+
+    There are already tests for this in test_initialization.py but here we want to specifically test diverging devices
+    for the model and state_dict.
+
+    """
+
+    model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
+
+    @pytest.mark.parametrize("device_model, device_sd", [("cpu", "cuda"), ("cuda", "cpu")])
+    def test_low_cpu_mem_usage_model_model_on_gpu_state_dict_on_cpu_works(self, device_model, device_sd):
+        inputs = {"input_ids": torch.randint(0, 100, (1, 10)), "attention_mask": torch.ones(1, 10)}
+        inputs = {k: v.to(device_model) for k, v in inputs.items()}
+
+        model = AutoModelForCausalLM.from_pretrained(self.model_id).to(device_model)
+        lora_config = LoraConfig(init_lora_weights=False, target_modules="all-linear")
+        model = get_peft_model(model, lora_config)
+        model.eval()
+        logits_not_low_cpu_mem = model(**inputs).logits
+
+        state_dict = get_peft_model_state_dict(model)
+        peft_model_state_dict = {}
+        # remap the state dict so that it can be correctly loaded, and move weights to the other device
+        prefix = "base_model.model."
+        for k, v in state_dict.items():
+            k = k[len(prefix) :]
+            peft_model_state_dict[k] = v.to(device_sd)
+
+        del model
+
+        model = AutoModelForCausalLM.from_pretrained(self.model_id).to(device_model)
+        model.eval()
+        inject_adapter_in_model(lora_config, model, low_cpu_mem_usage=True)
+        load_result = set_peft_model_state_dict(model, peft_model_state_dict, low_cpu_mem_usage=True)
+
+        # sanity check: all lora keys are matched
+        assert not any("lora" in k for k in load_result.missing_keys)
+        assert not any("lora" in k for k in load_result.unexpected_keys)
+
+        logits_low_cpu_mem = model(**inputs).logits
+
+        assert torch.allclose(logits_low_cpu_mem, logits_not_low_cpu_mem)
+        assert {p.device.type for p in model.parameters()} == {device_model}

From 534d361e7c86d72d59ab271e55e8b094c2b8c337 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Wed, 2 Oct 2024 18:31:01 +0200
Subject: [PATCH 07/22] TST Mark flaky X-LoRA test as xfail (#2114)

Currently, CI is failing constantly because one of the X-LoRA tests has
become flaky lately, most likely caused by the transformers 4.45.0
release. Therefore, this test is now marked to non-strictly xfail.

I cannot reproduce this error locally, neither on CPU nor GPU. It is
thus unclear how to fix this test.
---
 tests/test_xlora.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_xlora.py b/tests/test_xlora.py
index 7b70a4b240..e150116b06 100644
--- a/tests/test_xlora.py
+++ b/tests/test_xlora.py
@@ -188,6 +188,8 @@ def test_misc_methods(self, tokenizer, model):
 
         assert str(model) is not None
 
+    # TODO: On CI (but not locally), this test seems to have become flaky with the latest transformers changes (v4.45).
+    @pytest.mark.xfail
     def test_save_load_functional(self, tokenizer, model, tmp_path):
         inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt")
         outputs = model.generate(

From d9d3059e94eab8e4700f9889555feb8babd7d7f5 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Wed, 2 Oct 2024 18:52:00 +0200
Subject: [PATCH 08/22] ENH: Warn when from_pretrained misses PEFT keys (#2118)

After merging #2084, we now clean up the missing_keys when loading a
PEFT adapter to remove all but the relevant keys (the fact that base
model keys are missing is expected when loading a PEFT adapter).

Since the presence of missing_keys now really means that something might
have gone wrong during loading, we can now warn the user if they call
PeftModel.from_pretrained.

Note that load_adapter still does not warn, as here we return the
load_result and users can already check, but for from_pretrained, they
don't have that possibility.
---
 src/peft/peft_model.py       | 13 ++++++++++++-
 tests/test_initialization.py | 32 ++++++++++++++++++++++++++++++++
 tests/testing_common.py      |  6 +++++-
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py
index fb567ebfb3..26c4cf1fdb 100644
--- a/src/peft/peft_model.py
+++ b/src/peft/peft_model.py
@@ -583,7 +583,7 @@ def from_pretrained(
                 low_cpu_mem_usage=low_cpu_mem_usage,
             )
 
-        model.load_adapter(
+        load_result = model.load_adapter(
             model_id,
             adapter_name,
             is_trainable=is_trainable,
@@ -592,6 +592,17 @@ def from_pretrained(
             **kwargs,
         )
 
+        # 1. Remove VB-LoRA vector bank, since it's a shared parameter set via the VBLoRAModel
+        # 2. Remove the prompt encoder, as it does not need to be part of the checkpoint
+        missing_keys = [
+            k for k in load_result.missing_keys if "vblora_vector_bank" not in k and "prompt_encoder" not in k
+        ]
+        if missing_keys:
+            # Let's warn here since (in contrast to load_adapter) we don't return the load result, so it could be quite
+            # difficult for users to even notice that something might have gone wrong here. As we filter out non PEFT
+            # keys from the missing keys, this gives no false positives.
+            warnings.warn(f"Found missing adapter keys while loading the checkpoint: {missing_keys}")
+
         return model
 
     def _setup_prompt_encoder(self, adapter_name: str):
diff --git a/tests/test_initialization.py b/tests/test_initialization.py
index cc54003350..7284acbb98 100644
--- a/tests/test_initialization.py
+++ b/tests/test_initialization.py
@@ -1512,3 +1512,35 @@ def test_mixed_model_load_adapter_low_cpu_mem_usage_works(self, device, inputs,
 
         assert device_set_low_cpu_mem == device_set_not_low_cpu_mem
         assert torch.allclose(logits_low_cpu_mem, logits_not_low_cpu_mem)
+
+
+def test_from_pretrained_missing_keys_warning(recwarn, tmp_path):
+    # For more context, see issue 2115
+    # When loading a PEFT adapter and we're missing a PEFT-specific weight, there should be a warning.
+    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM")
+    config = LoraConfig()
+    model = get_peft_model(model, config)
+    state_dict = model.state_dict()
+
+    # first, sanity check that there are no warnings if no key is missing
+    model.save_pretrained(tmp_path)
+    del model
+    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM")
+    model = PeftModel.from_pretrained(model, tmp_path)
+    msg = "Found missing adapter keys"
+    assert not any(msg in str(w.message) for w in recwarn.list)
+
+    # remove a key from the state_dict
+    missing_key = "base_model.model.model.decoder.layers.0.self_attn.v_proj.lora_A.default.weight"
+
+    def new_state_dict():
+        return {k: v for k, v in state_dict.items() if k != missing_key}
+
+    model.state_dict = new_state_dict
+    model.save_pretrained(tmp_path)
+    del model
+
+    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM")
+    model = PeftModel.from_pretrained(model, tmp_path)
+    assert any(msg in str(w.message) for w in recwarn.list)
+    assert any(missing_key in str(w.message) for w in recwarn.list)
diff --git a/tests/testing_common.py b/tests/testing_common.py
index 860948bcfb..3eec02510f 100644
--- a/tests/testing_common.py
+++ b/tests/testing_common.py
@@ -18,6 +18,7 @@
 import re
 import shutil
 import tempfile
+import warnings
 from collections import OrderedDict
 from dataclasses import replace
 
@@ -378,7 +379,10 @@ def _test_save_pretrained(self, model_id, config_cls, config_kwargs, safe_serial
                 model.save_pretrained(tmp_dirname, safe_serialization=False)
 
             model_from_pretrained = self.transformers_class.from_pretrained(model_id)
-            model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname)
+            with warnings.catch_warnings(record=True) as recs:
+                model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname)
+                # ensure that there is no warning
+                assert not any("Found missing adapter keys" in str(rec.message) for rec in recs)
 
             # check if the state dicts are equal
             if issubclass(config_cls, PromptEncoderConfig):

From 8d9ecbed080eed79f32b279bdec211767919bd94 Mon Sep 17 00:00:00 2001
From: JINO ROHIT <find.jinorohit@gmail.com>
Date: Thu, 3 Oct 2024 16:38:08 +0530
Subject: [PATCH 09/22] FEAT: Adding exclude modules param(#2044) (#2102)

Allows to exclude target modules.
---
 src/peft/tuners/adalora/config.py   |  3 ++
 src/peft/tuners/boft/config.py      | 21 ++++++--
 src/peft/tuners/fourierft/config.py | 11 ++++
 src/peft/tuners/hra/config.py       | 21 ++++++--
 src/peft/tuners/ia3/config.py       | 21 ++++++--
 src/peft/tuners/ln_tuning/config.py |  8 +++
 src/peft/tuners/loha/config.py      | 20 +++++--
 src/peft/tuners/lokr/config.py      | 23 ++++++--
 src/peft/tuners/lora/config.py      | 11 ++++
 src/peft/tuners/oft/config.py       | 11 ++++
 src/peft/tuners/poly/config.py      | 19 +++++--
 src/peft/tuners/tuners_utils.py     | 82 ++++++++++++++++++++++-------
 src/peft/tuners/vblora/config.py    | 23 ++++++--
 tests/test_tuners_utils.py          | 73 +++++++++++++++++++++++++
 14 files changed, 300 insertions(+), 47 deletions(-)

diff --git a/src/peft/tuners/adalora/config.py b/src/peft/tuners/adalora/config.py
index b508588a17..5419159397 100644
--- a/src/peft/tuners/adalora/config.py
+++ b/src/peft/tuners/adalora/config.py
@@ -61,6 +61,9 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
         # if target_modules is a regex expression, then layers_to_transform should be None
         if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
             raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
diff --git a/src/peft/tuners/boft/config.py b/src/peft/tuners/boft/config.py
index ecd6a2c13c..dcae4c0841 100644
--- a/src/peft/tuners/boft/config.py
+++ b/src/peft/tuners/boft/config.py
@@ -15,8 +15,10 @@
 # The implementation is based on "Parameter-Efficient Orthogonal Finetuning
 # via Butterfly Factorization" (https://arxiv.org/abs/2311.06243) in ICLR 2024.
 
+from __future__ import annotations
+
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 from peft.config import PeftConfig
 from peft.utils import PeftType
@@ -32,6 +34,10 @@ class BOFTConfig(PeftConfig):
         boft_block_num (`int`): Number of BOFT blocks per injected layer.
         boft_n_butterfly_factor (`int`): Number of butterfly factors across different layers.
         target_modules (`Union[List[str],str]`): The names of the modules to apply the adapter to.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         boft_dropout (`float`):
             The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the
             dropout layer in LoRA.
@@ -76,13 +82,17 @@ class BOFTConfig(PeftConfig):
             ),
         },
     )
-    target_modules: Optional[Union[List[str], str]] = field(
+    target_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": "List of module names or regex expression of the module names to replace with BOFT.",
             "example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ",
         },
     )
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from BOFT."},
+    )
     boft_dropout: float = field(
         default=0.0,
         metadata={
@@ -94,7 +104,7 @@ class BOFTConfig(PeftConfig):
         metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
     )
     bias: str = field(default="none", metadata={"help": "Bias type for BOFT. Can be 'none', 'all' or 'boft_only'"})
-    modules_to_save: Optional[List[str]] = field(
+    modules_to_save: Optional[list[str]] = field(
         default=None,
         metadata={
             "help": "List of modules apart from BOFT layers to be set as trainable and saved in the final checkpoint. ",
@@ -113,7 +123,7 @@ class BOFTConfig(PeftConfig):
             ),
         },
     )
-    layers_to_transform: Optional[Union[List[int], int]] = field(
+    layers_to_transform: Optional[Union[list[int], int]] = field(
         default=None,
         metadata={
             "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
@@ -131,6 +141,9 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
         if self.boft_block_size == 0 and self.boft_block_num == 0:
             raise ValueError(
                 f"Either `boft_block_size` or `boft_block_num` must be non-zero. Currently, boft_block_size = {self.boft_block_size} and boft_block_num = {self.boft_block_num}."
diff --git a/src/peft/tuners/fourierft/config.py b/src/peft/tuners/fourierft/config.py
index 1816dc4b0b..1efaa22f37 100644
--- a/src/peft/tuners/fourierft/config.py
+++ b/src/peft/tuners/fourierft/config.py
@@ -52,6 +52,10 @@ class FourierFTConfig(PeftConfig):
         target_modules (`Union[list[str],str]`):
             List of module names or regex expression of the module names to replace with FourierFT. For example, ['q',
             'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. Only linear layers are supported.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         fan_in_fan_out (`bool`):
             Set this to True if the layer to replace stores weight like (fan_in, fan_out).
         bias (`str`):
@@ -123,6 +127,10 @@ class FourierFTConfig(PeftConfig):
             )
         },
     )
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from fourierft."},
+    )
     bias: str = field(
         default="none", metadata={"help": "Bias type for FourierFT. Can be 'none', 'all' or 'fourier_only'."}
     )
@@ -179,6 +187,9 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
         # if target_modules is a regex expression, then layers_to_transform should be None
         if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
             raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
diff --git a/src/peft/tuners/hra/config.py b/src/peft/tuners/hra/config.py
index 1b5457d9af..01e90471a5 100644
--- a/src/peft/tuners/hra/config.py
+++ b/src/peft/tuners/hra/config.py
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 from peft.config import PeftConfig
 from peft.utils import PeftType
@@ -38,6 +40,10 @@ class HRAConfig(PeftConfig):
             the output layer. If this is not specified, modules will be chosen according to the model architecture. If
             the architecture is not known, an error will be raised -- in this case, you should specify the target
             modules manually.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         init_weights (`bool`):
             Whether to perform initialization of HRA weights.
         layers_to_transform (`Union[List[int], int]`):
@@ -64,13 +70,17 @@ class HRAConfig(PeftConfig):
         default=False,
         metadata={"help": "Whether to apply Gram-Schmidt orthogonalization or not."},
     )
-    target_modules: Optional[Union[List[str], str]] = field(
+    target_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": "List of module names or regex expression of the module names to replace with HRA.",
             "example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ",
         },
     )
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from HRA."},
+    )
     init_weights: bool = field(
         default=True,
         metadata={
@@ -80,7 +90,7 @@ class HRAConfig(PeftConfig):
             ),
         },
     )
-    layers_to_transform: Optional[Union[List[int], int]] = field(
+    layers_to_transform: Optional[Union[list[int], int]] = field(
         default=None,
         metadata={
             "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
@@ -93,7 +103,7 @@ class HRAConfig(PeftConfig):
         },
     )
     bias: str = field(default="none", metadata={"help": "Bias type for HRA. Can be 'none', 'all' or 'hra_only'"})
-    modules_to_save: Optional[List[str]] = field(
+    modules_to_save: Optional[list[str]] = field(
         default=None,
         metadata={
             "help": "List of modules apart from HRA layers to be set as trainable and saved in the final checkpoint. "
@@ -107,6 +117,9 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
         # if target_modules is a regex expression, then layers_to_transform should be None
         if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
             raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
diff --git a/src/peft/tuners/ia3/config.py b/src/peft/tuners/ia3/config.py
index 322ea068d3..8d103f99d7 100644
--- a/src/peft/tuners/ia3/config.py
+++ b/src/peft/tuners/ia3/config.py
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 from peft.config import PeftConfig
 from peft.utils import PeftType
@@ -33,6 +35,10 @@ class IA3Config(PeftConfig):
             excluding the output layer. If this is not specified, modules will be chosen according to the model
             architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
             the target modules manually.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         feedforward_modules (`Optional[Union[List[str], str]]`):
             The names of the modules to be treated as feedforward modules, as in the original paper. These modules will
             have (IA)³ vectors multiplied to the input, instead of the output. `feedforward_modules` must be a name or
@@ -47,7 +53,7 @@ class IA3Config(PeftConfig):
             discouraged.
     """
 
-    target_modules: Optional[Union[List[str], str]] = field(
+    target_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": (
@@ -59,7 +65,11 @@ class IA3Config(PeftConfig):
             ),
         },
     )
-    feedforward_modules: Optional[Union[List[str], str]] = field(
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from (IA)³."},
+    )
+    feedforward_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": "List of module names or a regex expression of module names which are feedforward"
@@ -70,7 +80,7 @@ class IA3Config(PeftConfig):
         default=False,
         metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
     )
-    modules_to_save: Optional[List[str]] = field(
+    modules_to_save: Optional[list[str]] = field(
         default=None,
         metadata={
             "help": "List of modules apart from (IA)^3 layers to be set as trainable and saved in the final checkpoint. "
@@ -88,6 +98,9 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
         self.feedforward_modules = (
             set(self.feedforward_modules) if isinstance(self.feedforward_modules, list) else self.feedforward_modules
         )
diff --git a/src/peft/tuners/ln_tuning/config.py b/src/peft/tuners/ln_tuning/config.py
index fac6a633c6..a47429484e 100644
--- a/src/peft/tuners/ln_tuning/config.py
+++ b/src/peft/tuners/ln_tuning/config.py
@@ -31,6 +31,10 @@ class LNTuningConfig(PeftConfig):
             '.*decoder.*' or '.*encoder.*'. If this is not specified, modules will be chosen according to the model
             architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
             the target modules manually.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         modules_to_save (`Optional[Union[List[str], str]]`):
             List of modules to be set as trainable and saved in the final checkpoint. For example, in Sequence
             Classification or Token Classification tasks, the final layer `classifier/score` are randomly initialized
@@ -48,6 +52,10 @@ class LNTuningConfig(PeftConfig):
             ),
         },
     )
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from LNTuning."},
+    )
     modules_to_save: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
diff --git a/src/peft/tuners/loha/config.py b/src/peft/tuners/loha/config.py
index c38ba7828b..3f47444eff 100644
--- a/src/peft/tuners/loha/config.py
+++ b/src/peft/tuners/loha/config.py
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 from peft.tuners.lycoris_utils import LycorisConfig
 from peft.utils import PeftType
@@ -43,6 +44,10 @@ class LoHaConfig(LycorisConfig):
             excluding the output layer. If this is not specified, modules will be chosen according to the model
             architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
             the target modules manually.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         init_weights (`bool`):
             Whether to perform initialization of adapter weights. This defaults to `True`, passing `False` is
             discouraged.
@@ -76,7 +81,7 @@ class LoHaConfig(LycorisConfig):
             "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)'
         },
     )
-    target_modules: Optional[Union[List[str], str]] = field(
+    target_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": "List of module names or regex expression of the module names to replace with LoHa."
@@ -84,6 +89,10 @@ class LoHaConfig(LycorisConfig):
             "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer."
         },
     )
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from LoHa."},
+    )
     init_weights: bool = field(
         default=True,
         metadata={
@@ -93,7 +102,7 @@ class LoHaConfig(LycorisConfig):
             ),
         },
     )
-    layers_to_transform: Optional[Union[List[int], int]] = field(
+    layers_to_transform: Optional[Union[list[int], int]] = field(
         default=None,
         metadata={
             "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
@@ -105,7 +114,7 @@ class LoHaConfig(LycorisConfig):
             "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
         },
     )
-    modules_to_save: Optional[List[str]] = field(
+    modules_to_save: Optional[list[str]] = field(
         default=None,
         metadata={
             "help": "List of modules apart from LoHA layers to be set as trainable and saved in the final checkpoint. "
@@ -119,3 +128,6 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
diff --git a/src/peft/tuners/lokr/config.py b/src/peft/tuners/lokr/config.py
index c8d60a7463..0f8e991556 100644
--- a/src/peft/tuners/lokr/config.py
+++ b/src/peft/tuners/lokr/config.py
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 from peft.tuners.lycoris_utils import LycorisConfig
 from peft.utils import PeftType
@@ -47,6 +48,10 @@ class LoKrConfig(LycorisConfig):
             excluding the output layer. If this is not specified, modules will be chosen according to the model
             architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
             the target modules manually.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         init_weights (`bool`):
             Whether to perform initialization of adapter weights. This defaults to `True`, passing `False` is
             discouraged.
@@ -85,7 +90,7 @@ class LoKrConfig(LycorisConfig):
         metadata={"help": "Perform rank decomposition of left kronecker product matrix."},
     )
     decompose_factor: int = field(default=-1, metadata={"help": "Kronecker product decomposition factor."})
-    target_modules: Optional[Union[List[str], str]] = field(
+    target_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": "List of module names or regex expression of the module names to replace with LoKr."
@@ -93,6 +98,10 @@ class LoKrConfig(LycorisConfig):
             "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer."
         },
     )
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from LoKr."},
+    )
     init_weights: bool = field(
         default=True,
         metadata={
@@ -102,7 +111,7 @@ class LoKrConfig(LycorisConfig):
             ),
         },
     )
-    layers_to_transform: Optional[Union[List[int], int]] = field(
+    layers_to_transform: Optional[Union[list[int], int]] = field(
         default=None,
         metadata={
             "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
@@ -114,7 +123,7 @@ class LoKrConfig(LycorisConfig):
             "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
         },
     )
-    modules_to_save: Optional[List[str]] = field(
+    modules_to_save: Optional[list[str]] = field(
         default=None,
         metadata={
             "help": "List of modules apart from LoKr layers to be set as trainable and saved in the final checkpoint. "
@@ -125,3 +134,9 @@ class LoKrConfig(LycorisConfig):
 
     def __post_init__(self):
         self.peft_type = PeftType.LOKR
+        self.target_modules = (
+            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
+        )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py
index 941582fe89..6fb383a274 100644
--- a/src/peft/tuners/lora/config.py
+++ b/src/peft/tuners/lora/config.py
@@ -85,6 +85,10 @@ class LoraConfig(PeftConfig):
             excluding the output layer. If this is not specified, modules will be chosen according to the model
             architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
             the target modules manually.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         lora_alpha (`int`):
             The alpha parameter for Lora scaling.
         lora_dropout (`float`):
@@ -166,6 +170,10 @@ class LoraConfig(PeftConfig):
             ),
         },
     )
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from Lora."},
+    )
     lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"})
     lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"})
     fan_in_fan_out: bool = field(
@@ -327,6 +335,9 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
         # if target_modules is a regex expression, then layers_to_transform should be None
         if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
             raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
diff --git a/src/peft/tuners/oft/config.py b/src/peft/tuners/oft/config.py
index 13a6b5d7ce..85bfc9cc24 100644
--- a/src/peft/tuners/oft/config.py
+++ b/src/peft/tuners/oft/config.py
@@ -44,6 +44,10 @@ class OFTConfig(PeftConfig):
         bias (`str`): Bias type for OFT. Can be 'none', 'all' or 'oft_only'. If 'all' or 'oft_only', the
             corresponding biases will be updated during training. Be aware that this means that, even when disabling
             the adapters, the model will not produce the same output as the base model would have without adaptation.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         init_weights (`bool`):
             Whether to perform initialization of OFT weights.
         layers_to_transform (`Union[List[int], int]`):
@@ -94,6 +98,10 @@ class OFTConfig(PeftConfig):
     bias: Literal["none", "all", "oft_only"] = field(
         default="none", metadata={"help": "Bias type for OFT. Can be 'none', 'all' or 'oft_only'"}
     )
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from OFT."},
+    )
     init_weights: bool = field(
         default=True,
         metadata={
@@ -163,6 +171,9 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
         if self.r == 0 and self.oft_block_size == 0:
             raise ValueError(
                 f"Either `r` or `oft_block_size` must be non-zero. Currently, r = {self.r} and oft_block_size = {self.oft_block_size}."
diff --git a/src/peft/tuners/poly/config.py b/src/peft/tuners/poly/config.py
index 3abbc93b02..fea09ce0bd 100644
--- a/src/peft/tuners/poly/config.py
+++ b/src/peft/tuners/poly/config.py
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from dataclasses import dataclass, field
-from typing import List, Literal, Optional, Union
+from typing import Literal, Optional, Union
 
 from peft.config import PeftConfig
 from peft.utils import PeftType
@@ -29,6 +31,10 @@ class PolyConfig(PeftConfig):
     Args:
         r (`int`): Attention dimension of each Lora in Poly.
         target_modules (`Union[List[str],str]`): The names of the modules to apply Poly to.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         modules_to_save (`List[str]`): List of modules apart from Poly layers to be set as trainable
             and saved in the final checkpoint.
         init_weights (bool): Whether to perform initialization of Poly weights.
@@ -41,14 +47,18 @@ class PolyConfig(PeftConfig):
     """
 
     r: int = field(default=8, metadata={"help": "Lora attention dimension"})
-    target_modules: Optional[Union[List[str], str]] = field(
+    target_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": "List of module names or regex expression of the module names to replace with Poly."
             "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
         },
     )
-    modules_to_save: Optional[List[str]] = field(
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from Poly."},
+    )
+    modules_to_save: Optional[list[str]] = field(
         default=None,
         metadata={
             "help": "List of modules apart from Poly layers to be set as trainable and saved in the final checkpoint. "
@@ -87,3 +97,6 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
diff --git a/src/peft/tuners/tuners_utils.py b/src/peft/tuners/tuners_utils.py
index 03b8531bfd..51994b456f 100644
--- a/src/peft/tuners/tuners_utils.py
+++ b/src/peft/tuners/tuners_utils.py
@@ -422,6 +422,8 @@ def inject_adapter(
 
         """
         peft_config = self.peft_config[adapter_name]
+        excluded_modules = []
+        unmatched_modules = []
         # Note: If possible, all checks should be performed *at the start of this method*.
         # This way, we can raise early if something goes wrong, without leaving the model
         # in a bad (half-initialized) state.
@@ -435,13 +437,12 @@ def inject_adapter(
         peft_config = self._prepare_adapter_config(peft_config, model_config)
 
         self._prepare_model(peft_config, model)
-        is_target_modules_in_base_model = False
         key_list = [key for key, _ in model.named_modules()]
 
-        if getattr(peft_config, "target_modules", None) == DUMMY_TARGET_MODULES:
+        uses_dummy_target_modules = getattr(peft_config, "target_modules", None) == DUMMY_TARGET_MODULES
+        if uses_dummy_target_modules:
             # dummy adapter, we allow not matching any module
             key_list = []
-            is_target_modules_in_base_model = True
 
         # update peft_config.target_modules if required
         peft_config = _maybe_include_all_linear_layers(peft_config, model)
@@ -467,6 +468,8 @@ def inject_adapter(
                 peft_config.target_modules = new_target_modules
 
         for key in key_list:
+            if not key:
+                continue
             # Check for modules_to_save in case
             if _check_for_modules_to_save and any(
                 key.endswith(f"{module_to_save}") for module_to_save in peft_config.modules_to_save
@@ -483,15 +486,45 @@ def inject_adapter(
                 _has_modules_to_save = True
                 continue
 
-            if not self._check_target_module_exists(peft_config, key):
-                continue
-
-            self.targeted_module_names.append(key)
-            is_target_modules_in_base_model = True
-            parent, target, target_name = _get_submodules(model, key)
-            ctx = init_empty_weights if low_cpu_mem_usage else nullcontext
-            with ctx():
-                self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key)
+            result = self._check_target_module_exists(peft_config, key)
+            if isinstance(result, _ExcludedModule):
+                excluded_modules.append(key)
+            elif not result:
+                unmatched_modules.append(key)
+            else:
+                self.targeted_module_names.append(key)
+                parent, target, target_name = _get_submodules(model, key)
+                ctx = init_empty_weights if low_cpu_mem_usage else nullcontext
+                with ctx():
+                    self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key)
+
+        if not self.targeted_module_names and not uses_dummy_target_modules:
+            if excluded_modules and not unmatched_modules:
+                # All targeted modules were excluded
+                raise ValueError(
+                    "All modules were excluded. This is likely unintended. "
+                    "Check your `target_modules` and `exclude_modules` configuration."
+                )
+            elif not excluded_modules and unmatched_modules:
+                # None of the targeted modules matched
+                raise ValueError(
+                    f"Target modules {peft_config.target_modules} not found in the base model. "
+                    f"Please check the target modules and try again."
+                )
+            else:
+                # Some modules did not match and some matched but were excluded
+                raise ValueError(
+                    "No modules were targeted for adaptation. "
+                    "This might be caused by a combination of mismatched target modules and excluded modules. "
+                    "Please check your `target_modules` and `exclude_modules` configuration."
+                )
+
+        elif hasattr(peft_config, "exclude_modules") and peft_config.exclude_modules and not excluded_modules:
+            # exclude_modules was passed but was not used
+            warnings.warn(
+                f"You have passed exclude_modules={peft_config.exclude_modules} but no modules were excluded. "
+                "Please check that exclude_modules was set correctly."
+            )
 
         tied_target_modules = self._get_tied_target_modules(model=model)
         if tied_target_modules:
@@ -502,13 +535,6 @@ def inject_adapter(
                 "See for example https://github.com/huggingface/peft/issues/2018."
             )
 
-        # Handle X-LoRA case.
-        if not is_target_modules_in_base_model and hasattr(peft_config, "target_modules"):
-            raise ValueError(
-                f"Target modules {peft_config.target_modules} not found in the base model. "
-                f"Please check the target modules and try again."
-            )
-
         # It's important to set the adapter here (again), because otherwise it can happen that if a 2nd adapter is
         # added, and it targets different layer(s) than the first adapter (which is active), then those different
         # layers will be activated, which we don't want.
@@ -903,6 +929,15 @@ def generate_suffixes(s):
     return required_suffixes
 
 
+class _ExcludedModule:
+    """
+    A private helper method used to represent excluded modules in the check_target_module_exists function.
+    """
+
+    def __bool__(self):
+        return False
+
+
 def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None:
     """A helper method to check if the passed module's key name matches any of the target modules in the adapter_config.
 
@@ -914,6 +949,15 @@ def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None:
         `bool` | `re.Match[str]` | `None`: True of match object if key matches any target modules from config, False or
         None if no match found
     """
+    if hasattr(config, "exclude_modules") and config.exclude_modules:
+        if isinstance(config.exclude_modules, str):
+            if re.fullmatch(config.exclude_modules, key):
+                return _ExcludedModule()
+        elif key in config.exclude_modules:
+            return _ExcludedModule()
+        elif any(key.endswith(f".{exclude_key}") for exclude_key in config.exclude_modules):
+            return _ExcludedModule()
+
     if isinstance(config.target_modules, str):
         target_module_found = re.fullmatch(config.target_modules, key)
     elif key in config.target_modules:
diff --git a/src/peft/tuners/vblora/config.py b/src/peft/tuners/vblora/config.py
index ed2b4461d6..879f54ea11 100644
--- a/src/peft/tuners/vblora/config.py
+++ b/src/peft/tuners/vblora/config.py
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 from peft.config import PeftConfig
 from peft.utils import PeftType
@@ -46,6 +48,10 @@ class VBLoRAConfig(PeftConfig):
             excluding the output layer. If this is not specified, modules will be chosen according to the model
             architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
             the target modules manually.
+        exclude_modules (`Optional[Union[List[str], str]]`):
+            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
+            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
+            module ends with any of the passed strings.
         save_only_topk_weights (`bool`):
             Whether to only save the topk weights. Setting `save_only_topk_weights = True` significantly reduces
             storage space. However, models saved in this mode can be used for merging or inference only, not for
@@ -97,7 +103,7 @@ class VBLoRAConfig(PeftConfig):
             "For more details, refer to the discussion in the paper."
         },
     )
-    target_modules: Optional[Union[List[str], str]] = field(
+    target_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": (
@@ -109,6 +115,10 @@ class VBLoRAConfig(PeftConfig):
             )
         },
     )
+    exclude_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={"help": "List of module names or regex expression of the module names to exclude from VBLoRA."},
+    )
     save_only_topk_weights: bool = field(
         default=False,
         metadata={
@@ -125,7 +135,7 @@ class VBLoRAConfig(PeftConfig):
         metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
     )
     bias: str = field(default="none", metadata={"help": "Bias type for VBLoRA. Can be 'none', 'all' or 'vblora_only'"})
-    modules_to_save: Optional[List[str]] = field(
+    modules_to_save: Optional[list[str]] = field(
         default=None,
         metadata={
             "help": (
@@ -155,14 +165,14 @@ class VBLoRAConfig(PeftConfig):
             ),
         },
     )
-    layers_to_transform: Optional[Union[List[int], int]] = field(
+    layers_to_transform: Optional[Union[list[int], int]] = field(
         default=None,
         metadata={
             "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. "
             "This only works when target_modules is a list of str."
         },
     )
-    layers_pattern: Optional[Union[List[str], str]] = field(
+    layers_pattern: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
@@ -175,3 +185,6 @@ def __post_init__(self):
         self.target_modules = (
             set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
         )
+        self.exclude_modules = (
+            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
+        )
diff --git a/tests/test_tuners_utils.py b/tests/test_tuners_utils.py
index 90dbea8d70..5e742f3c88 100644
--- a/tests/test_tuners_utils.py
+++ b/tests/test_tuners_utils.py
@@ -416,6 +416,79 @@ def test_realistic_example(self):
         assert model.targeted_module_names == expected
 
 
+class TestExcludedModuleNames(unittest.TestCase):
+    """Check that the attribute exclude_module is correctly set.
+
+    This checks LoRA and IA³, but this should be sufficient, testing all other tuners is not necessary.
+    """
+
+    def test_two_excluded_module_regex(self):
+        model = MLP()
+        model = get_peft_model(model, LoraConfig(target_modules=("lin.*"), exclude_modules="lin0"))
+        assert model.targeted_module_names == ["lin1"]
+
+    def test_two_excluded_module_list(self):
+        model = MLP()
+        model = get_peft_model(model, LoraConfig(target_modules=["lin0", "lin1"], exclude_modules="lin0"))
+        assert model.targeted_module_names == ["lin1"]
+
+    def test_multiple_excluded_modules_list(self):
+        model = MLP()
+        model = get_peft_model(model, LoraConfig(target_modules=["lin0", "lin1"], exclude_modules=["lin0"]))
+        assert model.targeted_module_names == ["lin1"]
+
+    def test_ia3_two_excluded_module_regex(self):
+        model = MLP()
+        model = get_peft_model(
+            model, IA3Config(target_modules=".*lin.*", feedforward_modules=".*lin.*", exclude_modules="lin0")
+        )
+        assert model.targeted_module_names == ["lin1"]
+
+    def test_ia3_multiple_excluded_modules_list(self):
+        model = MLP()
+        model = get_peft_model(
+            model, IA3Config(target_modules=["lin0", "lin1"], feedforward_modules=".*lin.*", exclude_modules=["lin1"])
+        )
+        assert model.targeted_module_names == ["lin0"]
+
+    def test_all_modules_excluded(self):
+        model = MLP()
+        with pytest.raises(ValueError, match="All modules were excluded"):
+            get_peft_model(
+                model,
+                LoraConfig(
+                    target_modules=["lin0", "lin1", "relu", "drop", "sm"],
+                    exclude_modules=["lin0", "lin1", "relu", "drop", "sm"],
+                ),
+            )
+
+    def test_no_modules_matched(self):
+        model = MLP()
+        with pytest.raises(ValueError, match="Target modules .* not found in the base model"):
+            get_peft_model(model, LoraConfig(target_modules=["non_existent_module"]))
+
+    def test_some_modules_excluded_some_unmatched(self):
+        model = MLP()
+        with pytest.raises(ValueError, match="No modules were targeted for adaptation"):
+            get_peft_model(model, LoraConfig(target_modules=["lin0", "non_existent_module"], exclude_modules=["lin0"]))
+
+    def test_exclude_modules_not_used(self):
+        model = MLP()
+        with pytest.warns(UserWarning, match="You have passed exclude_modules=.* but no modules were excluded"):
+            get_peft_model(model, LoraConfig(target_modules=["lin1"], exclude_modules=["non_existent_module"]))
+
+    def test_realistic_example(self):
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-BloomForCausalLM")
+        config = LoraConfig(task_type="CAUSAL_LM", exclude_modules="transformer.h.2.self_attention.query_key_value")
+        model = get_peft_model(model, config)
+        expected = [
+            f"transformer.h.{i}.self_attention.query_key_value"
+            for i in range(len(model.base_model.transformer.h))
+            if i != 2
+        ]
+        assert model.targeted_module_names == expected
+
+
 class TestModelAndLayerStatus:
     """Check the methods `get_layer_status` and `get_model_status`.`
 

From e6f927bfecba238f81e940b7f560284e5829dc2e Mon Sep 17 00:00:00 2001
From: Zeju1997 <47625089+Zeju1997@users.noreply.github.com>
Date: Mon, 7 Oct 2024 11:44:38 +0200
Subject: [PATCH 10/22] FIX BC breaking change to boft conv2d scaling variable
 (#2127)

---
 src/peft/tuners/boft/layer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/peft/tuners/boft/layer.py b/src/peft/tuners/boft/layer.py
index df99ac1bbf..b8ee6c8763 100644
--- a/src/peft/tuners/boft/layer.py
+++ b/src/peft/tuners/boft/layer.py
@@ -772,7 +772,7 @@ def update_layer(
         self.boft_R[adapter_name] = nn.Parameter(
             torch.zeros(boft_n_butterfly_factor + 1, boft_block_num, boft_block_size, boft_block_size)
         )
-        self.boft_s[adapter_name] = nn.Parameter(torch.ones(int(self.out_features), 1))
+        self.boft_s[adapter_name] = nn.Parameter(torch.ones(1, int(self.out_features)))
 
         self.reset_boft_parameters(adapter_name, init_weights)
 
@@ -881,7 +881,7 @@ def get_delta_weight(self, adapter) -> tuple[torch.Tensor, torch.Tensor]:
         """
 
         boft_R = self.boft_R[adapter]
-        boft_s = self.boft_s[adapter]
+        boft_s = self.boft_s[adapter].transpose(0, 1)
 
         N, D, H, _ = boft_R.shape
         boft_R = boft_R.view(N * D, H, H)
@@ -925,7 +925,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
                 if active_adapter not in self.boft_R.keys():
                     continue
                 boft_R = self.boft_R[active_adapter]
-                boft_s = self.boft_s[active_adapter]
+                boft_s = self.boft_s[active_adapter].transpose(0, 1)
                 dropout = self.boft_dropout[active_adapter]
 
                 N, D, H, _ = boft_R.shape

From 859fd880e66fd152401b48c1f8c8bacbcc576f70 Mon Sep 17 00:00:00 2001
From: Ziad Helal <z.mahmoud1999@gmail.com>
Date: Mon, 7 Oct 2024 15:00:42 +0200
Subject: [PATCH 11/22] FEAT: VeRA quantization using bitsandbytes (#2070)
 (#2076)

VeRA can now be used with 4bit and 8bit bnb quantization.
---
 docs/source/developer_guides/quantization.md |  10 +-
 docs/source/package_reference/vera.md        |   5 +-
 src/peft/helpers.py                          |   3 +-
 src/peft/tuners/vera/__init__.py             |  16 +
 src/peft/tuners/vera/bnb.py                  | 408 +++++++++++++++++++
 src/peft/tuners/vera/model.py                |  48 ++-
 tests/test_common_gpu.py                     | 135 ++++++
 tests/test_gpu_examples.py                   | 227 +++++++++++
 8 files changed, 840 insertions(+), 12 deletions(-)
 create mode 100644 src/peft/tuners/vera/bnb.py

diff --git a/docs/source/developer_guides/quantization.md b/docs/source/developer_guides/quantization.md
index 114021cafc..c0848c086f 100644
--- a/docs/source/developer_guides/quantization.md
+++ b/docs/source/developer_guides/quantization.md
@@ -187,9 +187,17 @@ peft_config = LoraConfig(...)
 quantized_model = get_peft_model(quantized_model, peft_config)
 ```
 
+## Other Supported PEFT Methods
+
+Besides LoRA, the following PEFT methods also support quantization:
+
+- **VeRA** (supports bitsandbytes quantization)
+- **AdaLoRA** (supports both bitsandbytes and GPTQ quantization)
+- **(IA)³** (supports bitsandbytes quantization)
+
 ## Next steps
 
 If you're interested in learning more about quantization, the following may be helpful:
 
-* Learn more about details about QLoRA and check out some benchmarks on its impact in the [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) blog post.
+* Learn more details about QLoRA and check out some benchmarks on its impact in the [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) blog post.
 * Read more about different quantization schemes in the Transformers [Quantization](https://hf.co/docs/transformers/main/quantization) guide.
diff --git a/docs/source/package_reference/vera.md b/docs/source/package_reference/vera.md
index 9f7bb19a38..f9ed281275 100644
--- a/docs/source/package_reference/vera.md
+++ b/docs/source/package_reference/vera.md
@@ -22,12 +22,9 @@ When saving the adapter parameters, it's possible to eschew storing the low rank
 
 To handle different shapes of adapted layers, VeRA initializes shared A and B matrices with the largest required size for each dimension. During the forward pass, submatrices A and B for a given layer are sliced out from these shared matrices and used as described in the paper. For example, adapting two linear layers of shapes (100, 20) and (80, 50) will create A and B matrices of shapes (rank, 50) and (100, rank) respectively. Then, to adapt a layer of shape (100, 20), submatrices A and B of shapes (rank, 20) and (100, rank) will be extracted.
 
-VeRA currently has the following constraints:
+VeRA currently has the following constraint:
 
 - Only `nn.Linear` layers are supported.
-- Quantized layers are not supported.
-
-If these constraints don't work for your use case, use LoRA instead.
 
 The abstract from the paper is:
 
diff --git a/src/peft/helpers.py b/src/peft/helpers.py
index 0b08a951f6..dc64486882 100644
--- a/src/peft/helpers.py
+++ b/src/peft/helpers.py
@@ -168,7 +168,8 @@ def rescale_adapter_scale(model, multiplier):
 
     Args:
         model: The model containing `LoraLayer` modules whose scaling is to be adjusted.
-        multiplier (float or int): The multiplier that rescales the `scaling` attribute. Must be of type float or int.
+        multiplier (float or int):
+            The multiplier that rescales the `scaling` attribute. Must be of type float or int.
 
     Raises:
         ValueError: If the model does not contain any `LoraLayer`
diff --git a/src/peft/tuners/vera/__init__.py b/src/peft/tuners/vera/__init__.py
index cf35881834..816a499a77 100644
--- a/src/peft/tuners/vera/__init__.py
+++ b/src/peft/tuners/vera/__init__.py
@@ -12,9 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from peft.import_utils import is_bnb_4bit_available, is_bnb_available
+
 from .config import VeraConfig
 from .layer import Linear, VeraLayer
 from .model import VeraModel
 
 
 __all__ = ["VeraConfig", "VeraLayer", "Linear", "VeraModel"]
+
+
+def __getattr__(name):
+    if (name == "Linear8bitLt") and is_bnb_available():
+        from .bnb import Linear8bitLt
+
+        return Linear8bitLt
+
+    if (name == "Linear4bit") and is_bnb_4bit_available():
+        from .bnb import Linear4bit
+
+        return Linear4bit
+
+    raise AttributeError(f"module {__name__} has no attribute {name}")
diff --git a/src/peft/tuners/vera/bnb.py b/src/peft/tuners/vera/bnb.py
new file mode 100644
index 0000000000..272560d697
--- /dev/null
+++ b/src/peft/tuners/vera/bnb.py
@@ -0,0 +1,408 @@
+# Copyright 2024-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from typing import Optional
+
+import bitsandbytes as bnb
+import torch
+
+from peft.import_utils import is_bnb_4bit_available, is_bnb_available
+from peft.tuners.tuners_utils import check_adapters_to_merge
+from peft.utils.integrations import dequantize_bnb_weight
+from peft.utils.other import transpose
+
+from .layer import VeraLayer
+
+
+if is_bnb_available():
+
+    class Linear8bitLt(torch.nn.Module, VeraLayer):
+        def __init__(
+            self,
+            base_layer: torch.nn.Module,
+            adapter_name: str,
+            vera_A,
+            vera_B,
+            r: int = 0,
+            vera_dropout: float = 0.0,
+            fan_in_fan_out: bool = False,
+            init_weights: bool = True,
+            d_initial: float = 0.1,
+            **kwargs,
+        ) -> None:
+            super().__init__()
+            VeraLayer.__init__(self, base_layer)
+            self.fan_in_fan_out = fan_in_fan_out
+
+            self._active_adapter = adapter_name
+            self.update_layer(
+                adapter_name,
+                vera_A,
+                vera_B,
+                r,
+                vera_dropout=vera_dropout,
+                init_weights=init_weights,
+                d_initial=d_initial,
+            )
+
+        def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
+            if self.merged:
+                warnings.warn(
+                    f"Already following adapters were merged {','.join(self.merged_adapters)}. "
+                    f"You are now additionally merging {','.join(self.active_adapters)}."
+                )
+
+            adapter_names = check_adapters_to_merge(self, adapter_names)
+            if not adapter_names:
+                return
+
+            for active_adapter in adapter_names:
+                if active_adapter not in self.vera_lambda_d.keys():
+                    continue
+
+                warnings.warn(
+                    "Merge vera module to 8-bit linear may get different generations due to rounding errors."
+                )
+                vera_data = self.get_delta_weight(active_adapter)
+
+                weight = self.get_base_layer().weight
+                state = self.get_base_layer().state
+                if state.SCB is None:
+                    state.SCB = weight.SCB
+
+                output = dequantize_bnb_weight(weight, state)
+                w_data = output.to(vera_data.dtype).to(vera_data.device) + vera_data
+
+                if safe_merge and not torch.isfinite(w_data).all():
+                    raise ValueError(
+                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
+                    )
+
+                self.get_base_layer().weight = bnb.nn.Int8Params(
+                    w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights
+                ).to(weight.device)
+                state.reset_grads()
+                self.merged_adapters.append(active_adapter)
+
+        def unmerge(self) -> None:
+            if not self.merged:
+                warnings.warn("Already unmerged. Nothing to do")
+                return
+
+            while len(self.merged_adapters) > 0:
+                active_adapter = self.merged_adapters.pop()
+                if active_adapter not in self.vera_lambda_d.keys():
+                    continue
+                warnings.warn(
+                    "Unmerge vera module to 8-bit linear may get different generations due to rounding errors."
+                )
+                vera_data = self.get_delta_weight(active_adapter)
+
+                weight = self.get_base_layer().weight
+                state = self.get_base_layer().state
+                if state.SCB is None:
+                    state.SCB = weight.SCB
+                output = dequantize_bnb_weight(weight, state=state)
+
+                w_data = output.to(vera_data.dtype).to(vera_data.device) - vera_data
+
+                self.get_base_layer().weight = bnb.nn.Int8Params(
+                    w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights
+                ).to(weight.device)
+                state.reset_grads()
+
+        def get_delta_weight(self, adapter) -> torch.Tensor:
+            """
+            Compute the delta weight for the given adapter.
+
+            Args:
+                adapter (str): The name of the adapter for which the delta weight should be computed.
+
+            Returns:
+                torch.Tensor: The computed delta weight for the VeRA adapter.
+
+            Note:
+                This method implements the VeRA-specific weight update. Unlike LoRA, VeRA uses shared projection
+                matrices (vera_A and vera_B) across all layers, along with per-layer trainable parameters (lambda_d and
+                lambda_b).
+            """
+            # Retrieve shared projection matrices
+            vera_A = self.vera_A[adapter]
+            vera_B = self.vera_B[adapter]
+
+            # Retrieve per-layer trainable parameters
+            device = vera_B.device
+            dtype = vera_B.dtype
+
+            # In case users wants to merge the adapter weights that are in
+            # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
+            # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
+            cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)
+
+            lambda_d = self.vera_lambda_d[adapter]
+            lambda_b = self.vera_lambda_b[adapter]
+
+            if cast_to_fp32:
+                vera_A = vera_A.float()
+                vera_B = vera_B.float()
+                lambda_d = lambda_d.float()
+                lambda_b = lambda_b.float()
+
+            sliced_A = vera_A[:, : self.in_features]
+            sliced_B = vera_B[: self.out_features, :]
+            lambda_b = lambda_b.unsqueeze(-1)
+            lambda_d = lambda_d.unsqueeze(-1)
+
+            # VeRA-specific computation:
+            # 1. Apply lambda_d to the input projection (vera_A)
+            # 2. Apply lambda_b to the output projection (vera_B)
+            # 3. Compute the outer product of the scaled projections
+            output_tensor = transpose((lambda_b * sliced_B) @ (lambda_d * sliced_A), self.fan_in_fan_out)
+
+            if cast_to_fp32:
+                output_tensor = output_tensor.to(dtype=dtype)
+
+            return output_tensor
+
+        def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+            """
+            Perform the forward pass using the VeRA adapter.
+
+            Args:
+                x (torch.Tensor): Input tensor.
+
+            Returns:
+                torch.Tensor: Output tensor after applying the VeRA adaptation.
+
+            Note:
+                This method implements the VeRA-specific forward pass. It applies the shared projections (vera_A and
+                vera_B) along with the per-layer trainable parameters (lambda_d and lambda_b) to compute the adapter
+                output.
+            """
+            if self.disable_adapters:
+                if self.merged:
+                    self.unmerge()
+                result = self.base_layer(x, *args, **kwargs)
+            elif self.merged:
+                result = self.base_layer(x, *args, **kwargs)
+            else:
+                result = self.base_layer(x, *args, **kwargs)
+                for active_adapter in self.active_adapters:
+                    if active_adapter not in self.vera_lambda_d.keys():
+                        continue
+
+                    lambda_d = self.vera_lambda_d[active_adapter]
+                    lambda_b = self.vera_lambda_b[active_adapter]
+
+                    vera_A = self.vera_A[active_adapter]
+                    vera_B = self.vera_B[active_adapter]
+
+                    dropout = self.vera_dropout[active_adapter]
+
+                    requires_conversion = not torch.is_autocast_enabled()
+                    if requires_conversion:
+                        expected_dtype = result.dtype
+                        compute_dtype = lambda_d.dtype
+                        if x.dtype != compute_dtype:
+                            x = x.to(compute_dtype)
+
+                    sliced_A = vera_A[:, : self.in_features]
+                    sliced_B = vera_B[: self.out_features, :]
+
+                    x_temp = dropout(x.to(lambda_d.dtype))
+
+                    adapter_output = lambda_b * torch.nn.functional.linear(
+                        lambda_d * torch.nn.functional.linear(x_temp, sliced_A), sliced_B
+                    )
+
+                    if requires_conversion:
+                        adapter_output = adapter_output.to(expected_dtype)
+
+                    result = result + adapter_output
+
+            # Ensure the output tensor has the same dtype as the input tensor
+            return result.to(x.dtype)
+
+        def __repr__(self) -> str:
+            rep = super().__repr__()
+            return "vera." + rep
+
+
+if is_bnb_4bit_available():
+
+    class Linear4bit(torch.nn.Module, VeraLayer):
+        def __init__(
+            self,
+            base_layer: torch.nn.Module,
+            adapter_name: str,
+            vera_A,
+            vera_B,
+            r: int = 0,
+            vera_dropout: float = 0.0,
+            fan_in_fan_out: bool = False,
+            init_weights: bool = True,
+            d_initial: float = 0.1,
+            **kwargs,
+        ) -> None:
+            super().__init__()
+            VeraLayer.__init__(self, base_layer)
+            self.fan_in_fan_out = fan_in_fan_out
+
+            self._active_adapter = adapter_name
+            self.update_layer(
+                adapter_name,
+                vera_A,
+                vera_B,
+                r,
+                vera_dropout=vera_dropout,
+                init_weights=init_weights,
+                d_initial=d_initial,
+            )
+
+        def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
+            if self.merged:
+                warnings.warn(
+                    f"Already following adapters were merged {','.join(self.merged_adapters)}. "
+                    f"You are now additionally merging {','.join(self.active_adapters)}."
+                )
+
+            adapter_names = check_adapters_to_merge(self, adapter_names)
+            if not adapter_names:
+                return
+
+            for active_adapter in adapter_names:
+                if active_adapter not in self.vera_lambda_d.keys():
+                    continue
+
+                warnings.warn(
+                    "Merge vera module to 4-bit linear may get different generations due to rounding errors."
+                )
+                vera_data = self.get_delta_weight(active_adapter)
+
+                weight = self.get_base_layer().weight
+                kwargs = weight.__dict__
+                w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) + vera_data
+
+                if safe_merge and not torch.isfinite(w_data).all():
+                    raise ValueError(
+                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
+                    )
+
+                self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to(
+                    weight.device
+                )
+                self.merged_adapters.append(active_adapter)
+
+        def unmerge(self) -> None:
+            if not self.merged:
+                warnings.warn("Already unmerged. Nothing to do")
+                return
+
+            while len(self.merged_adapters) > 0:
+                active_adapter = self.merged_adapters.pop()
+                if active_adapter not in self.vera_lambda_d.keys():
+                    continue
+                warnings.warn(
+                    "Unmerge vera module to 4-bit linear may get different generations due to rounding errors."
+                )
+                vera_data = self.get_delta_weight(active_adapter)
+
+                weight = self.get_base_layer().weight
+                kwargs = weight.__dict__
+                w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) - vera_data
+
+                self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to(
+                    weight.device
+                )
+
+        def get_delta_weight(self, adapter) -> torch.Tensor:
+            vera_A = self.vera_A[adapter]
+            vera_B = self.vera_B[adapter]
+
+            device = vera_B.device
+            dtype = vera_B.dtype
+
+            cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)
+
+            lambda_d = self.vera_lambda_d[adapter]
+            lambda_b = self.vera_lambda_b[adapter]
+
+            if cast_to_fp32:
+                vera_A = vera_A.float()
+                vera_B = vera_B.float()
+                lambda_d = lambda_d.float()
+                lambda_b = lambda_b.float()
+
+            sliced_A = vera_A[:, : self.in_features]
+            sliced_B = vera_B[: self.out_features, :]
+            lambda_b = lambda_b.unsqueeze(-1)
+            lambda_d = lambda_d.unsqueeze(-1)
+
+            output_tensor = transpose((lambda_b * sliced_B) @ (lambda_d * sliced_A), self.fan_in_fan_out)
+
+            if cast_to_fp32:
+                output_tensor = output_tensor.to(dtype=dtype)
+
+            return output_tensor
+
+        def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+            if self.disable_adapters:
+                if self.merged:
+                    self.unmerge()
+                result = self.base_layer(x, *args, **kwargs)
+            elif self.merged:
+                result = self.base_layer(x, *args, **kwargs)
+            else:
+                result = self.base_layer(x, *args, **kwargs)
+                result = result.clone()
+                for active_adapter in self.active_adapters:
+                    if active_adapter not in self.vera_lambda_d.keys():
+                        continue
+
+                    lambda_d = self.vera_lambda_d[active_adapter]
+                    lambda_b = self.vera_lambda_b[active_adapter]
+
+                    vera_A = self.vera_A[active_adapter]
+                    vera_B = self.vera_B[active_adapter]
+
+                    dropout = self.vera_dropout[active_adapter]
+
+                    requires_conversion = not torch.is_autocast_enabled()
+                    if requires_conversion:
+                        expected_dtype = result.dtype
+                        compute_dtype = lambda_d.dtype
+                        if x.dtype != compute_dtype:
+                            x = x.to(compute_dtype)
+
+                    sliced_A = vera_A[:, : self.in_features]
+                    sliced_B = vera_B[: self.out_features, :]
+
+                    x_temp = dropout(x.to(lambda_d.dtype))
+
+                    adapter_output = lambda_b * torch.nn.functional.linear(
+                        lambda_d * torch.nn.functional.linear(x_temp, sliced_A), sliced_B
+                    )
+
+                    if requires_conversion:
+                        adapter_output = adapter_output.to(expected_dtype)
+
+                    result = result + adapter_output
+
+            # Ensure the output tensor has the same dtype as the input tensor
+            return result.to(x.dtype)
+
+        def __repr__(self) -> str:
+            rep = super().__repr__()
+            return "vera." + rep
diff --git a/src/peft/tuners/vera/model.py b/src/peft/tuners/vera/model.py
index d268d2ae0a..f863a074e6 100644
--- a/src/peft/tuners/vera/model.py
+++ b/src/peft/tuners/vera/model.py
@@ -26,6 +26,7 @@
 from tqdm import tqdm
 from transformers.pytorch_utils import Conv1D
 
+from peft.import_utils import is_bnb_4bit_available, is_bnb_available
 from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
 from peft.utils import (
     TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING,
@@ -119,10 +120,11 @@ def _find_dim(self, config) -> tuple[int, int]:
             if not self._check_target_module_exists(peft_config, key):
                 continue
 
-            if isinstance(module, (nn.Linear, Conv1D)):
-                module_shape = tuple(module.weight.shape)
-                if isinstance(module, Conv1D):
-                    module_shape = module_shape[::-1]
+            if isinstance(module, nn.Linear):
+                module_shape = module.out_features, module.in_features
+            elif isinstance(module, Conv1D):
+                module_shape = module.weight.ds_shape if hasattr(module.weight, "ds_shape") else module.weight.shape
+                module_shape = module_shape[::-1]
             else:
                 continue
 
@@ -150,6 +152,7 @@ def _init_vera_A_vera_B(self, config: VeraConfig, adapter_name: str) -> None:
         generator = torch.Generator(device="cpu").manual_seed(config.projection_prng_key)
         vera_A = _kaiming_init((config.r, linear_in_dim), generator=generator)
         vera_B = _kaiming_init((linear_out_dim, config.r), generator=generator)
+
         self.vera_A[adapter_name] = vera_A
         self.vera_B[adapter_name] = vera_B
 
@@ -214,9 +217,10 @@ def _create_and_replace(
             "vera_dropout": vera_config.vera_dropout,
             "fan_in_fan_out": vera_config.fan_in_fan_out,
             "init_weights": vera_config.init_weights,
+            "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False),
+            "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False),
         }
         kwargs["bias"] = bias
-        # TODO: add quantization support
 
         if isinstance(target, Linear):
             target.update_layer(
@@ -287,14 +291,46 @@ def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
 
     @staticmethod
     def _create_new_module(vera_config, vera_A, vera_B, adapter_name, target, **kwargs):
+        # avoid eager bnb import
+        if is_bnb_available():
+            import bitsandbytes as bnb
+
+            from .bnb import Linear8bitLt
+
+        if is_bnb_4bit_available():
+            from .bnb import Linear4bit
+
         bias = kwargs.pop("bias", False)
+        loaded_in_8bit = kwargs.get("loaded_in_8bit", False)
+        loaded_in_4bit = kwargs.get("loaded_in_4bit", False)
 
         if isinstance(target, BaseTunerLayer):
             target_base_layer = target.get_base_layer()
         else:
             target_base_layer = target
 
-        if isinstance(target_base_layer, torch.nn.Linear):
+        if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt):
+            eightbit_kwargs = kwargs.copy()
+            eightbit_kwargs.update(
+                {
+                    "has_fp16_weights": target_base_layer.state.has_fp16_weights,
+                    "memory_efficient_backward": target_base_layer.state.memory_efficient_backward,
+                    "threshold": target_base_layer.state.threshold,
+                    "index": target_base_layer.index,
+                }
+            )
+            return Linear8bitLt(target, adapter_name, vera_A, vera_B, **eightbit_kwargs)
+        elif loaded_in_4bit and isinstance(target_base_layer, bnb.nn.Linear4bit):
+            fourbit_kwargs = kwargs.copy()
+            fourbit_kwargs.update(
+                {
+                    "compute_dtype": target_base_layer.compute_dtype,
+                    "compress_statistics": target_base_layer.weight.compress_statistics,
+                    "quant_type": target_base_layer.weight.quant_type,
+                }
+            )
+            return Linear4bit(target, adapter_name, vera_A, vera_B, **fourbit_kwargs)
+        elif isinstance(target_base_layer, torch.nn.Linear):
             if kwargs["fan_in_fan_out"]:
                 warnings.warn(
                     "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
diff --git a/tests/test_common_gpu.py b/tests/test_common_gpu.py
index 970306c081..54a3758fdc 100644
--- a/tests/test_common_gpu.py
+++ b/tests/test_common_gpu.py
@@ -71,10 +71,12 @@
 
     from peft.tuners.ia3 import Linear8bitLt as IA3Linear8bitLt
     from peft.tuners.lora import Linear8bitLt as LoraLinear8bitLt
+    from peft.tuners.vera import Linear8bitLt as VeraLinear8bitLt
 
     if is_bnb_4bit_available():
         from peft.tuners.ia3 import Linear4bit as IA3Linear4bit
         from peft.tuners.lora import Linear4bit as LoraLinear4bit
+        from peft.tuners.vera import Linear4bit as VeraLinear4bit
 
 
 @require_non_cpu
@@ -148,6 +150,54 @@ def test_lora_bnb_8bit_quantization(self):
         whisper_8bit = get_peft_model(whisper_8bit, config)
         assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt)
 
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    def test_vera_bnb_8bit_quantization(self):
+        r"""
+        Test that tests if the 8bit quantization using VeRA works as expected
+        """
+        whisper_8bit = WhisperForConditionalGeneration.from_pretrained(
+            self.audio_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        opt_8bit = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_8bit = AutoModelForSeq2SeqLM.from_pretrained(
+            self.seq2seq_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_vera_config = VeraConfig(
+            r=16, target_modules=["q", "v"], vera_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
+        )
+
+        opt_vera_config = VeraConfig(
+            r=16,
+            target_modules=["q_proj", "v_proj"],
+            vera_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+
+        config = VeraConfig(r=32, target_modules=["q_proj", "v_proj"], vera_dropout=0.05, bias="none")
+
+        flan_8bit = get_peft_model(flan_8bit, flan_vera_config)
+        assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, VeraLinear8bitLt)
+
+        opt_8bit = get_peft_model(opt_8bit, opt_vera_config)
+        assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear8bitLt)
+
+        whisper_8bit = get_peft_model(whisper_8bit, config)
+        assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear8bitLt)
+
     @require_bitsandbytes
     @pytest.mark.multi_gpu_tests
     @pytest.mark.single_gpu_tests
@@ -259,6 +309,43 @@ def test_adalora_bnb_quantization_from_pretrained_safetensors(self, quantization
             assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A
             assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A
 
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    @parameterized.expand(["4bit", "8bit"])
+    def test_vera_bnb_quantization_from_pretrained_safetensors(self, quantization):
+        r"""
+        Tests that the bnb quantization using VeRA works as expected with safetensors weights.
+        """
+        model_id = "facebook/opt-350m"
+        kwargs = {"device_map": "auto"}
+        if quantization == "4bit":
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
+        else:
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+
+        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+        config = VeraConfig(task_type=TaskType.CAUSAL_LM)
+        peft_model = get_peft_model(model, config)
+        peft_model = prepare_model_for_kbit_training(peft_model)
+        peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            peft_model.save_pretrained(tmp_dir)
+            model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+            model = PeftModel.from_pretrained(model, tmp_dir)
+            model = prepare_model_for_kbit_training(model)
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # loading a 2nd adapter works, #1239
+            model.load_adapter(tmp_dir, "adapter2")
+            model.set_adapter("adapter2")
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # check that both adapters are in the same layer
+            assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.vera_A
+            assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.vera_A
+
     @require_bitsandbytes
     @pytest.mark.multi_gpu_tests
     @pytest.mark.single_gpu_tests
@@ -383,6 +470,54 @@ def test_lora_bnb_4bit_quantization(self):
         whisper_4bit = get_peft_model(whisper_4bit, config)
         assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit)
 
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    def test_vera_bnb_4bit_quantization(self):
+        r"""
+        Test that tests if the 4bit quantization using VeRA works as expected
+        """
+        whisper_4bit = WhisperForConditionalGeneration.from_pretrained(
+            self.audio_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+        )
+
+        opt_4bit = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+        )
+
+        flan_4bit = AutoModelForSeq2SeqLM.from_pretrained(
+            self.seq2seq_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+        )
+
+        flan_vera_config = VeraConfig(
+            r=16, target_modules=["q", "v"], vera_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
+        )
+
+        opt_vera_config = VeraConfig(
+            r=16,
+            target_modules=["q_proj", "v_proj"],
+            vera_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+
+        config = VeraConfig(r=32, target_modules=["q_proj", "v_proj"], vera_dropout=0.05, bias="none")
+
+        flan_4bit = get_peft_model(flan_4bit, flan_vera_config)
+        assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, VeraLinear4bit)
+
+        opt_4bit = get_peft_model(opt_4bit, opt_vera_config)
+        assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear4bit)
+
+        whisper_4bit = get_peft_model(whisper_4bit, config)
+        assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear4bit)
+
     @require_bitsandbytes
     @pytest.mark.multi_gpu_tests
     @pytest.mark.single_gpu_tests
diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py
index b39bd99f5d..aa3a6e5005 100644
--- a/tests/test_gpu_examples.py
+++ b/tests/test_gpu_examples.py
@@ -54,6 +54,7 @@
     PeftModel,
     PromptEncoderConfig,
     TaskType,
+    VeraConfig,
     get_peft_model,
     get_peft_model_state_dict,
     inject_adapter_in_model,
@@ -1133,6 +1134,232 @@ def test_initialize_dora_with_bnb_on_cpu(self, kbit):
         weights_not_cpu = [name for name, p in peft_model.named_parameters() if p.device != torch.device("cpu")]
         assert not weights_not_cpu
 
+    @pytest.mark.single_gpu_tests
+    def test_causal_lm_training_vera(self):
+        r"""
+        Same as test_causal_lm_training but with VeRA
+        """
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = AutoModelForCausalLM.from_pretrained(
+                self.causal_lm_model_id,
+                quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+                device_map="auto",
+            )
+
+            tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)
+            model = prepare_model_for_kbit_training(model)
+
+            config = VeraConfig(
+                r=16,
+                target_modules=["q_proj", "v_proj"],
+                vera_dropout=0.05,
+                bias="none",
+                task_type="CAUSAL_LM",
+            )
+
+            model = get_peft_model(model, config)
+
+            data = load_dataset("ybelkada/english_quotes_copy")
+            data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+
+            trainer = Trainer(
+                model=model,
+                train_dataset=data["train"],
+                args=TrainingArguments(
+                    per_device_train_batch_size=4,
+                    gradient_accumulation_steps=4,
+                    warmup_steps=2,
+                    max_steps=3,
+                    learning_rate=2e-4,
+                    fp16=True,
+                    logging_steps=1,
+                    output_dir=tmp_dir,
+                ),
+                data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
+            )
+            model.config.use_cache = False
+            trainer.train()
+
+            model.cpu().save_pretrained(tmp_dir)
+
+            assert "adapter_config.json" in os.listdir(tmp_dir)
+            assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)
+
+            # assert loss is not None
+            assert trainer.state.log_history[-1]["train_loss"] is not None
+
+    @pytest.mark.single_gpu_tests
+    def test_causal_lm_training_4bit_vera(self):
+        r"""
+        Same as test_causal_lm_training_4bit but with VeRA
+        """
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = AutoModelForCausalLM.from_pretrained(
+                self.causal_lm_model_id,
+                quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+                device_map="auto",
+            )
+
+            tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)
+            model = prepare_model_for_kbit_training(model)
+
+            config = VeraConfig(
+                r=16,
+                target_modules=["q_proj", "v_proj"],
+                vera_dropout=0.05,
+                bias="none",
+                task_type="CAUSAL_LM",
+            )
+
+            model = get_peft_model(model, config)
+
+            data = load_dataset("ybelkada/english_quotes_copy")
+            data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+
+            trainer = Trainer(
+                model=model,
+                train_dataset=data["train"],
+                args=TrainingArguments(
+                    per_device_train_batch_size=4,
+                    gradient_accumulation_steps=4,
+                    warmup_steps=2,
+                    max_steps=3,
+                    learning_rate=2e-4,
+                    fp16=True,
+                    logging_steps=1,
+                    output_dir=tmp_dir,
+                ),
+                data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
+            )
+            model.config.use_cache = False
+            trainer.train()
+
+            model.cpu().save_pretrained(tmp_dir)
+
+            assert "adapter_config.json" in os.listdir(tmp_dir)
+            assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)
+
+            # assert loss is not None
+            assert trainer.state.log_history[-1]["train_loss"] is not None
+
+    @pytest.mark.multi_gpu_tests
+    def test_causal_lm_training_multi_gpu_vera(self):
+        r"""
+        Same as test_causal_lm_training_multi_gpu but with VeRA
+        """
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = AutoModelForCausalLM.from_pretrained(
+                self.causal_lm_model_id,
+                device_map="auto",
+                quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+            )
+
+            assert set(model.hf_device_map.values()) == set(range(torch.cuda.device_count()))
+
+            model = prepare_model_for_kbit_training(model)
+
+            setattr(model, "model_parallel", True)
+            setattr(model, "is_parallelizable", True)
+
+            config = VeraConfig(
+                r=16,
+                target_modules=["q_proj", "v_proj"],
+                vera_dropout=0.05,
+                bias="none",
+                task_type="CAUSAL_LM",
+            )
+
+            model = get_peft_model(model, config)
+
+            data = load_dataset("Abirate/english_quotes")
+            data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)
+
+            trainer = Trainer(
+                model=model,
+                train_dataset=data["train"],
+                args=TrainingArguments(
+                    per_device_train_batch_size=4,
+                    gradient_accumulation_steps=4,
+                    warmup_steps=2,
+                    max_steps=3,
+                    learning_rate=2e-4,
+                    fp16=True,
+                    logging_steps=1,
+                    output_dir=tmp_dir,
+                ),
+                data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
+            )
+            model.config.use_cache = False
+            trainer.train()
+
+            model.cpu().save_pretrained(tmp_dir)
+
+            assert "adapter_config.json" in os.listdir(tmp_dir)
+            assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)
+
+            # assert loss is not None
+            assert trainer.state.log_history[-1]["train_loss"] is not None
+
+    @pytest.mark.multi_gpu_tests
+    def test_causal_lm_training_multi_gpu_4bit_vera(self):
+        r"""
+        Same as test_causal_lm_training_multi_gpu_4bit but with VeRA
+        """
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = AutoModelForCausalLM.from_pretrained(
+                self.causal_lm_model_id,
+                device_map="auto",
+                quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+            )
+
+            assert set(model.hf_device_map.values()) == set(range(torch.cuda.device_count()))
+
+            model = prepare_model_for_kbit_training(model)
+
+            setattr(model, "model_parallel", True)
+            setattr(model, "is_parallelizable", True)
+
+            config = VeraConfig(
+                r=16,
+                target_modules=["q_proj", "v_proj"],
+                vera_dropout=0.05,
+                bias="none",
+                task_type="CAUSAL_LM",
+            )
+
+            model = get_peft_model(model, config)
+
+            data = load_dataset("Abirate/english_quotes")
+            data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)
+
+            trainer = Trainer(
+                model=model,
+                train_dataset=data["train"],
+                args=TrainingArguments(
+                    per_device_train_batch_size=4,
+                    gradient_accumulation_steps=4,
+                    warmup_steps=2,
+                    max_steps=3,
+                    learning_rate=2e-4,
+                    fp16=True,
+                    logging_steps=1,
+                    output_dir=tmp_dir,
+                ),
+                data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
+            )
+            model.config.use_cache = False
+            trainer.train()
+
+            model.cpu().save_pretrained(tmp_dir)
+
+            assert "adapter_config.json" in os.listdir(tmp_dir)
+            assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)
+
+            # assert loss is not None
+            assert trainer.state.log_history[-1]["train_loss"] is not None
+
 
 @require_torch_gpu
 @require_auto_gptq

From 5e91b546354159a143b1c71dc6bfc15569e067ad Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Tue, 8 Oct 2024 16:36:34 +0200
Subject: [PATCH 12/22] Bump version to 0.13.2.dev0 (#2137)

---
 setup.py             | 2 +-
 src/peft/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 8185e960c9..d1d4adbc3e 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
 from setuptools import find_packages, setup
 
 
-VERSION = "0.13.1.dev0"
+VERSION = "0.13.2.dev0"
 
 extras = {}
 extras["quality"] = [
diff --git a/src/peft/__init__.py b/src/peft/__init__.py
index 97205771e7..32d7a1a43d 100644
--- a/src/peft/__init__.py
+++ b/src/peft/__init__.py
@@ -17,7 +17,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.13.1.dev0"
+__version__ = "0.13.2.dev0"
 
 from .auto import (
     AutoPeftModel,

From 9918977ecf23d3840ba50cd876f8b4a08f742de3 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Tue, 8 Oct 2024 18:10:19 +0200
Subject: [PATCH 13/22] FEAT: Support torchao (#2062)

Supports torch AO quantization. Currently supported:

- int8_weight_only
- int8_dynamic_activation_int8_weight

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
---
 docker/peft-gpu/Dockerfile                    |   1 +
 docs/source/developer_guides/quantization.md  |  24 +
 ...LoRA-torchao-8bit-dynamic-activation.ipynb | 526 ++++++++++++++++++
 .../LoRA-torchao-8bit.ipynb                   | 526 ++++++++++++++++++
 src/peft/import_utils.py                      |  16 +
 src/peft/tuners/lora/model.py                 |   9 +
 src/peft/tuners/lora/torchao.py               | 146 +++++
 src/peft/utils/integrations.py                |   4 +
 src/peft/utils/other.py                       |  16 +-
 tests/test_gpu_examples.py                    | 370 +++++++++++-
 tests/testing_utils.py                        |   8 +
 11 files changed, 1642 insertions(+), 4 deletions(-)
 create mode 100644 examples/sequence_classification/LoRA-torchao-8bit-dynamic-activation.ipynb
 create mode 100644 examples/sequence_classification/LoRA-torchao-8bit.ipynb
 create mode 100644 src/peft/tuners/lora/torchao.py

diff --git a/docker/peft-gpu/Dockerfile b/docker/peft-gpu/Dockerfile
index 0757776455..dc7ce458f6 100644
--- a/docker/peft-gpu/Dockerfile
+++ b/docker/peft-gpu/Dockerfile
@@ -62,6 +62,7 @@ RUN source activate peft && \
     librosa \
     "soundfile>=0.12.1" \
     scipy \
+    torchao \
     git+https://github.com/huggingface/transformers \
     git+https://github.com/huggingface/accelerate \
     peft[test]@git+https://github.com/huggingface/peft
diff --git a/docs/source/developer_guides/quantization.md b/docs/source/developer_guides/quantization.md
index c0848c086f..1d0271ba90 100644
--- a/docs/source/developer_guides/quantization.md
+++ b/docs/source/developer_guides/quantization.md
@@ -187,6 +187,30 @@ peft_config = LoraConfig(...)
 quantized_model = get_peft_model(quantized_model, peft_config)
 ```
 
+## torchao (PyTorch Architecture Optimization)
+
+PEFT supports models quantized with [torchao](https://github.com/pytorch/ao) ("ao") for int8 quantization.
+
+```python
+from peft import LoraConfig, get_peft_model
+from transformers import AutoModelForCausalLM, TorchAoConfig
+
+model_id = ...
+quantization_config = TorchAoConfig(quant_type="int8_weight_only")
+base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
+peft_config = LoraConfig(...)
+model = get_peft_model(base_model, peft_config)
+```
+
+### Caveats:
+
+- Use the most recent versions of torchao (>= v0.4.0) and transformers (> 4.42).
+- Only linear layers are currently supported.
+- `quant_type = "int4_weight_only"` is currently not supported.
+- `NF4` is not implemented in transformers as of yet and is thus also not supported.
+- DoRA only works with `quant_type = "int8_weight_only"` at the moment.
+- There is explicit support for torchao when used with LoRA. However, when torchao quantizes a layer, its class does not change, only the type of the underlying tensor. For this reason, PEFT methods other than LoRA will generally also work with torchao, even if not explicitly supported. Be aware, however, that **merging only works correctly with LoRA and with `quant_type = "int8_weight_only"`**. If you use a different PEFT method or dtype, merging will likely result in an error, and even it doesn't, the results will still be incorrect.
+
 ## Other Supported PEFT Methods
 
 Besides LoRA, the following PEFT methods also support quantization:
diff --git a/examples/sequence_classification/LoRA-torchao-8bit-dynamic-activation.ipynb b/examples/sequence_classification/LoRA-torchao-8bit-dynamic-activation.ipynb
new file mode 100644
index 0000000000..9bf512ea32
--- /dev/null
+++ b/examples/sequence_classification/LoRA-torchao-8bit-dynamic-activation.ipynb
@@ -0,0 +1,526 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "900b542d-0249-453c-a915-a061b80af69f",
+   "metadata": {},
+   "source": [
+    "# PyTorch AO (torchao) with int8_dynamic_activation_int8_weight"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10e1acc3-50b8-4d40-bdf3-0133c113cc4b",
+   "metadata": {},
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a9935ae2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import argparse\n",
+    "import os\n",
+    "\n",
+    "import torch\n",
+    "from torch.optim import AdamW\n",
+    "from torch.utils.data import DataLoader\n",
+    "from peft import (\n",
+    "    get_peft_config,\n",
+    "    get_peft_model,\n",
+    "    get_peft_model_state_dict,\n",
+    "    set_peft_model_state_dict,\n",
+    "    LoraConfig,\n",
+    "    PeftType,\n",
+    "    PrefixTuningConfig,\n",
+    "    PromptEncoderConfig,\n",
+    ")\n",
+    "\n",
+    "import evaluate\n",
+    "from datasets import load_dataset\n",
+    "from transformers import AutoModelForSequenceClassification, AutoTokenizer, TorchAoConfig, get_linear_schedule_with_warmup, set_seed\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eafdd532-b1eb-4aac-8077-3386a84c7cdb",
+   "metadata": {},
+   "source": [
+    "## Parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e3b13308",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 16\n",
+    "model_name_or_path = \"google/gemma-2-2b\"\n",
+    "task = \"mrpc\"\n",
+    "device = \"cuda\"\n",
+    "num_epochs = 5\n",
+    "lr = 2e-5\n",
+    "\n",
+    "lora_rank = 16\n",
+    "lora_alpha = 32\n",
+    "lora_dropout = 0.1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c7fb69bf-0182-4111-b715-e2e659b42b1d",
+   "metadata": {},
+   "source": [
+    "## Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d2f4d25e-30b9-431f-95c3-adb390dc6fcd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n",
+    "    padding_side = \"left\"\n",
+    "else:\n",
+    "    padding_side = \"right\"\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n",
+    "if getattr(tokenizer, \"pad_token_id\") is None:\n",
+    "    tokenizer.pad_token_id = tokenizer.eos_token_id\n",
+    "\n",
+    "datasets = load_dataset(\"glue\", task)\n",
+    "metric = evaluate.load(\"glue\", task)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1ea852bc-a040-4244-8fd3-516307cecd14",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_function(examples):\n",
+    "    # max_length=None => use the model max length (it's actually the default)\n",
+    "    outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n",
+    "    return outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "cf5ef289-f42f-4582-bd5e-9852ad8beff2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenized_datasets = datasets.map(\n",
+    "    tokenize_function,\n",
+    "    batched=True,\n",
+    "    remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n",
+    ")\n",
+    "\n",
+    "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n",
+    "# transformers library\n",
+    "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "739b3655-9db0-48bc-8542-308c6d5e0b8b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def collate_fn(examples):\n",
+    "    return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "0288f311-8475-4a0e-99af-e4b909d10e01",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Instantiate dataloaders.\n",
+    "train_dataloader = DataLoader(\n",
+    "    tokenized_datasets[\"train\"],\n",
+    "    shuffle=True,\n",
+    "    collate_fn=collate_fn,\n",
+    "    batch_size=batch_size,\n",
+    ")\n",
+    "eval_dataloader = DataLoader(\n",
+    "    tokenized_datasets[\"validation\"],\n",
+    "    shuffle=False,\n",
+    "    collate_fn=collate_fn,\n",
+    "    batch_size=batch_size,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fcaf6f9e-c9d1-445a-9f08-18ef462f67ce",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "e5dfff56-ea80-4561-aeaf-43216bbb9af7",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2ac42f98e60d412496fe77ed7eb5c6df",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "source": [
+    "quant_config = TorchAoConfig(quant_type=\"int8_dynamic_activation_int8_weight\")\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(\n",
+    "    model_name_or_path, return_dict=True, device_map=0, torch_dtype=torch.bfloat16, quantization_config=quant_config\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "0526f571",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "peft_config = LoraConfig(\n",
+    "    task_type=\"SEQ_CLS\",\n",
+    "    r=lora_rank,\n",
+    "    lora_alpha=lora_alpha,\n",
+    "    lora_dropout=lora_dropout,\n",
+    "    target_modules=[\"q_proj\", \"v_proj\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ceeae329-e931-4d52-8a28-9c87e5cdb4cf",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "trainable params: 3,199,488 || all params: 2,617,545,984 || trainable%: 0.1222\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = get_peft_model(model, peft_config)\n",
+    "model.print_trainable_parameters()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1b3d2544-3028-4e2a-9c56-d4d7d9d674de",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "0d2d0381",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optimizer = AdamW(params=model.parameters(), lr=lr)\n",
+    "\n",
+    "# Instantiate scheduler\n",
+    "lr_scheduler = get_linear_schedule_with_warmup(\n",
+    "    optimizer=optimizer,\n",
+    "    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n",
+    "    num_training_steps=(len(train_dataloader) * num_epochs),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "f04c88ca-84eb-4184-afe6-3869b6f96b76",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "PeftModelForSequenceClassification(\n",
+       "  (base_model): LoraModel(\n",
+       "    (model): Gemma2ForSequenceClassification(\n",
+       "      (model): Gemma2Model(\n",
+       "        (embed_tokens): Embedding(256000, 2304, padding_idx=0)\n",
+       "        (layers): ModuleList(\n",
+       "          (0-25): 26 x Gemma2DecoderLayer(\n",
+       "            (self_attn): Gemma2Attention(\n",
+       "              (q_proj): lora.TorchaoLoraLinear(\n",
+       "                (base_layer): Linear(in_features=2304, out_features=2048, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([2048, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
+       "                (lora_dropout): ModuleDict(\n",
+       "                  (default): Dropout(p=0.1, inplace=False)\n",
+       "                )\n",
+       "                (lora_A): ModuleDict(\n",
+       "                  (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+       "                )\n",
+       "                (lora_B): ModuleDict(\n",
+       "                  (default): Linear(in_features=16, out_features=2048, bias=False)\n",
+       "                )\n",
+       "                (lora_embedding_A): ParameterDict()\n",
+       "                (lora_embedding_B): ParameterDict()\n",
+       "                (lora_magnitude_vector): ModuleDict()\n",
+       "              )\n",
+       "              (k_proj): Linear(in_features=2304, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
+       "              (v_proj): lora.TorchaoLoraLinear(\n",
+       "                (base_layer): Linear(in_features=2304, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
+       "                (lora_dropout): ModuleDict(\n",
+       "                  (default): Dropout(p=0.1, inplace=False)\n",
+       "                )\n",
+       "                (lora_A): ModuleDict(\n",
+       "                  (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+       "                )\n",
+       "                (lora_B): ModuleDict(\n",
+       "                  (default): Linear(in_features=16, out_features=1024, bias=False)\n",
+       "                )\n",
+       "                (lora_embedding_A): ParameterDict()\n",
+       "                (lora_embedding_B): ParameterDict()\n",
+       "                (lora_magnitude_vector): ModuleDict()\n",
+       "              )\n",
+       "              (o_proj): Linear(in_features=2048, out_features=2304, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([2304, 2048]), block_size=(1, 2048), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
+       "              (rotary_emb): Gemma2RotaryEmbedding()\n",
+       "            )\n",
+       "            (mlp): Gemma2MLP(\n",
+       "              (gate_proj): Linear(in_features=2304, out_features=9216, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
+       "              (up_proj): Linear(in_features=2304, out_features=9216, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
+       "              (down_proj): Linear(in_features=9216, out_features=2304, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([2304, 9216]), block_size=(1, 9216), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
+       "              (act_fn): PytorchGELUTanh()\n",
+       "            )\n",
+       "            (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "            (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "            (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "            (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "          )\n",
+       "        )\n",
+       "        (norm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "      )\n",
+       "      (score): ModulesToSaveWrapper(\n",
+       "        (original_module): Linear(in_features=2304, out_features=2, bias=False)\n",
+       "        (modules_to_save): ModuleDict(\n",
+       "          (default): Linear(in_features=2304, out_features=2, bias=False)\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.config.use_cache = False\n",
+    "model.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "fa0e73be",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|                                                                                                                                                                                                                                                       | 0/230 [00:00<?, ?it/s]You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:43<00:00,  5.27it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00,  5.33it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 1 | train loss 1.7618 | {'accuracy': 0.46568627450980393, 'f1': 0.5458333333333333}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:43<00:00,  5.29it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00,  5.47it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 2 | train loss 1.1905 | {'accuracy': 0.5245098039215687, 'f1': 0.6325757575757576}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:43<00:00,  5.32it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00,  5.34it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 3 | train loss 1.1478 | {'accuracy': 0.5318627450980392, 'f1': 0.6456400742115028}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:43<00:00,  5.29it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00,  5.36it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 4 | train loss 1.1384 | {'accuracy': 0.5367647058823529, 'f1': 0.6506469500924215}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:44<00:00,  5.21it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00,  5.43it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 5 | train loss 1.1365 | {'accuracy': 0.5367647058823529, 'f1': 0.6506469500924215}\n",
+      "CPU times: user 4min 2s, sys: 399 ms, total: 4min 2s\n",
+      "Wall time: 4min 2s\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "for epoch in range(1, num_epochs + 1):\n",
+    "    model.train()\n",
+    "    train_losses = []\n",
+    "    for step, batch in enumerate(tqdm(train_dataloader)):\n",
+    "        batch.to(device)\n",
+    "        outputs = model(**batch)\n",
+    "        loss = outputs.loss\n",
+    "        if not torch.isfinite(loss):\n",
+    "            raise ValueError(\"non-finite loss encountered\")\n",
+    "\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "        lr_scheduler.step()\n",
+    "        optimizer.zero_grad()\n",
+    "        train_losses.append(loss.item())\n",
+    "\n",
+    "    model.eval()\n",
+    "    for step, batch in enumerate(tqdm(eval_dataloader)):\n",
+    "        batch.to(device)\n",
+    "        with torch.no_grad():\n",
+    "            outputs = model(**batch)\n",
+    "        predictions = outputs.logits.argmax(dim=-1)\n",
+    "        predictions, references = predictions, batch[\"labels\"]\n",
+    "        metric.add_batch(\n",
+    "            predictions=predictions,\n",
+    "            references=references,\n",
+    "        )\n",
+    "\n",
+    "    eval_metric = metric.compute()\n",
+    "    train_loss = sum(train_losses) / len(train_losses)\n",
+    "    print(f\"epoch {epoch} | train loss {train_loss:.4f} |\", eval_metric)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "6a1f937b-a0a5-40ec-8e41-5a5a18c6bff6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# memory: 4122MiB"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/sequence_classification/LoRA-torchao-8bit.ipynb b/examples/sequence_classification/LoRA-torchao-8bit.ipynb
new file mode 100644
index 0000000000..3f592776a5
--- /dev/null
+++ b/examples/sequence_classification/LoRA-torchao-8bit.ipynb
@@ -0,0 +1,526 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "412e41ee-72d3-4e71-bd3a-703b37429c57",
+   "metadata": {},
+   "source": [
+    "# PyTorch AO (torchao) with int8_weight_only"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10e1acc3-50b8-4d40-bdf3-0133c113cc4b",
+   "metadata": {},
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a9935ae2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import argparse\n",
+    "import os\n",
+    "\n",
+    "import torch\n",
+    "from torch.optim import AdamW\n",
+    "from torch.utils.data import DataLoader\n",
+    "from peft import (\n",
+    "    get_peft_config,\n",
+    "    get_peft_model,\n",
+    "    get_peft_model_state_dict,\n",
+    "    set_peft_model_state_dict,\n",
+    "    LoraConfig,\n",
+    "    PeftType,\n",
+    "    PrefixTuningConfig,\n",
+    "    PromptEncoderConfig,\n",
+    ")\n",
+    "\n",
+    "import evaluate\n",
+    "from datasets import load_dataset\n",
+    "from transformers import AutoModelForSequenceClassification, AutoTokenizer, TorchAoConfig, get_linear_schedule_with_warmup, set_seed\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eafdd532-b1eb-4aac-8077-3386a84c7cdb",
+   "metadata": {},
+   "source": [
+    "## Parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e3b13308",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 16\n",
+    "model_name_or_path = \"google/gemma-2-2b\"\n",
+    "task = \"mrpc\"\n",
+    "device = \"cuda\"\n",
+    "num_epochs = 5\n",
+    "lr = 2e-5\n",
+    "\n",
+    "lora_rank = 16\n",
+    "lora_alpha = 32\n",
+    "lora_dropout = 0.1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c7fb69bf-0182-4111-b715-e2e659b42b1d",
+   "metadata": {},
+   "source": [
+    "## Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d2f4d25e-30b9-431f-95c3-adb390dc6fcd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n",
+    "    padding_side = \"left\"\n",
+    "else:\n",
+    "    padding_side = \"right\"\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n",
+    "if getattr(tokenizer, \"pad_token_id\") is None:\n",
+    "    tokenizer.pad_token_id = tokenizer.eos_token_id\n",
+    "\n",
+    "datasets = load_dataset(\"glue\", task)\n",
+    "metric = evaluate.load(\"glue\", task)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1ea852bc-a040-4244-8fd3-516307cecd14",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_function(examples):\n",
+    "    # max_length=None => use the model max length (it's actually the default)\n",
+    "    outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n",
+    "    return outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "cf5ef289-f42f-4582-bd5e-9852ad8beff2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenized_datasets = datasets.map(\n",
+    "    tokenize_function,\n",
+    "    batched=True,\n",
+    "    remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n",
+    ")\n",
+    "\n",
+    "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n",
+    "# transformers library\n",
+    "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "739b3655-9db0-48bc-8542-308c6d5e0b8b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def collate_fn(examples):\n",
+    "    return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "0288f311-8475-4a0e-99af-e4b909d10e01",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Instantiate dataloaders.\n",
+    "train_dataloader = DataLoader(\n",
+    "    tokenized_datasets[\"train\"],\n",
+    "    shuffle=True,\n",
+    "    collate_fn=collate_fn,\n",
+    "    batch_size=batch_size,\n",
+    ")\n",
+    "eval_dataloader = DataLoader(\n",
+    "    tokenized_datasets[\"validation\"],\n",
+    "    shuffle=False,\n",
+    "    collate_fn=collate_fn,\n",
+    "    batch_size=batch_size,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fcaf6f9e-c9d1-445a-9f08-18ef462f67ce",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "e5dfff56-ea80-4561-aeaf-43216bbb9af7",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "512d9dc10a4d4ecc88b9440575b0973a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "source": [
+    "quant_config = TorchAoConfig(quant_type=\"int8_weight_only\")\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(\n",
+    "    model_name_or_path, return_dict=True, device_map=0, torch_dtype=torch.bfloat16, quantization_config=quant_config\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "0526f571",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "peft_config = LoraConfig(\n",
+    "    task_type=\"SEQ_CLS\",\n",
+    "    r=lora_rank,\n",
+    "    lora_alpha=lora_alpha,\n",
+    "    lora_dropout=lora_dropout,\n",
+    "    target_modules=[\"q_proj\", \"v_proj\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ceeae329-e931-4d52-8a28-9c87e5cdb4cf",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "trainable params: 3,199,488 || all params: 2,617,545,984 || trainable%: 0.1222\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = get_peft_model(model, peft_config)\n",
+    "model.print_trainable_parameters()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1b3d2544-3028-4e2a-9c56-d4d7d9d674de",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "0d2d0381",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optimizer = AdamW(params=model.parameters(), lr=lr)\n",
+    "\n",
+    "# Instantiate scheduler\n",
+    "lr_scheduler = get_linear_schedule_with_warmup(\n",
+    "    optimizer=optimizer,\n",
+    "    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n",
+    "    num_training_steps=(len(train_dataloader) * num_epochs),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "f04c88ca-84eb-4184-afe6-3869b6f96b76",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "PeftModelForSequenceClassification(\n",
+       "  (base_model): LoraModel(\n",
+       "    (model): Gemma2ForSequenceClassification(\n",
+       "      (model): Gemma2Model(\n",
+       "        (embed_tokens): Embedding(256000, 2304, padding_idx=0)\n",
+       "        (layers): ModuleList(\n",
+       "          (0-25): 26 x Gemma2DecoderLayer(\n",
+       "            (self_attn): Gemma2Attention(\n",
+       "              (q_proj): lora.TorchaoLoraLinear(\n",
+       "                (base_layer): Linear(in_features=2304, out_features=2048, weight=AffineQuantizedTensor(shape=torch.Size([2048, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
+       "                (lora_dropout): ModuleDict(\n",
+       "                  (default): Dropout(p=0.1, inplace=False)\n",
+       "                )\n",
+       "                (lora_A): ModuleDict(\n",
+       "                  (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+       "                )\n",
+       "                (lora_B): ModuleDict(\n",
+       "                  (default): Linear(in_features=16, out_features=2048, bias=False)\n",
+       "                )\n",
+       "                (lora_embedding_A): ParameterDict()\n",
+       "                (lora_embedding_B): ParameterDict()\n",
+       "                (lora_magnitude_vector): ModuleDict()\n",
+       "              )\n",
+       "              (k_proj): Linear(in_features=2304, out_features=1024, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
+       "              (v_proj): lora.TorchaoLoraLinear(\n",
+       "                (base_layer): Linear(in_features=2304, out_features=1024, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
+       "                (lora_dropout): ModuleDict(\n",
+       "                  (default): Dropout(p=0.1, inplace=False)\n",
+       "                )\n",
+       "                (lora_A): ModuleDict(\n",
+       "                  (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+       "                )\n",
+       "                (lora_B): ModuleDict(\n",
+       "                  (default): Linear(in_features=16, out_features=1024, bias=False)\n",
+       "                )\n",
+       "                (lora_embedding_A): ParameterDict()\n",
+       "                (lora_embedding_B): ParameterDict()\n",
+       "                (lora_magnitude_vector): ModuleDict()\n",
+       "              )\n",
+       "              (o_proj): Linear(in_features=2048, out_features=2304, weight=AffineQuantizedTensor(shape=torch.Size([2304, 2048]), block_size=(1, 2048), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
+       "              (rotary_emb): Gemma2RotaryEmbedding()\n",
+       "            )\n",
+       "            (mlp): Gemma2MLP(\n",
+       "              (gate_proj): Linear(in_features=2304, out_features=9216, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
+       "              (up_proj): Linear(in_features=2304, out_features=9216, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
+       "              (down_proj): Linear(in_features=9216, out_features=2304, weight=AffineQuantizedTensor(shape=torch.Size([2304, 9216]), block_size=(1, 9216), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
+       "              (act_fn): PytorchGELUTanh()\n",
+       "            )\n",
+       "            (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "            (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "            (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "            (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "          )\n",
+       "        )\n",
+       "        (norm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+       "      )\n",
+       "      (score): ModulesToSaveWrapper(\n",
+       "        (original_module): Linear(in_features=2304, out_features=2, bias=False)\n",
+       "        (modules_to_save): ModuleDict(\n",
+       "          (default): Linear(in_features=2304, out_features=2, bias=False)\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.config.use_cache = False\n",
+    "model.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "fa0e73be",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|                                                                                                                                                                                                                                                       | 0/230 [00:00<?, ?it/s]You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.19it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.19it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 1 | train loss 1.0672 | {'accuracy': 0.6715686274509803, 'f1': 0.7751677852348994}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.26it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.19it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 2 | train loss 0.6261 | {'accuracy': 0.7377450980392157, 'f1': 0.8201680672268907}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.25it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.15it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 3 | train loss 0.4743 | {'accuracy': 0.7867647058823529, 'f1': 0.8502581755593803}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.30it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.17it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 4 | train loss 0.4006 | {'accuracy': 0.803921568627451, 'f1': 0.8586572438162544}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.26it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.10it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch 5 | train loss 0.3585 | {'accuracy': 0.8235294117647058, 'f1': 0.8791946308724832}\n",
+      "CPU times: user 2min 8s, sys: 38 s, total: 2min 46s\n",
+      "Wall time: 2min 46s\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "for epoch in range(1, num_epochs + 1):\n",
+    "    model.train()\n",
+    "    train_losses = []\n",
+    "    for step, batch in enumerate(tqdm(train_dataloader)):\n",
+    "        batch.to(device)\n",
+    "        outputs = model(**batch)\n",
+    "        loss = outputs.loss\n",
+    "        if not torch.isfinite(loss):\n",
+    "            raise ValueError(\"non-finite loss encountered\")\n",
+    "\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "        lr_scheduler.step()\n",
+    "        optimizer.zero_grad()\n",
+    "        train_losses.append(loss.item())\n",
+    "\n",
+    "    model.eval()\n",
+    "    for step, batch in enumerate(tqdm(eval_dataloader)):\n",
+    "        batch.to(device)\n",
+    "        with torch.no_grad():\n",
+    "            outputs = model(**batch)\n",
+    "        predictions = outputs.logits.argmax(dim=-1)\n",
+    "        predictions, references = predictions, batch[\"labels\"]\n",
+    "        metric.add_batch(\n",
+    "            predictions=predictions,\n",
+    "            references=references,\n",
+    "        )\n",
+    "\n",
+    "    eval_metric = metric.compute()\n",
+    "    train_loss = sum(train_losses) / len(train_losses)\n",
+    "    print(f\"epoch {epoch} | train loss {train_loss:.4f} |\", eval_metric)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "6a1f937b-a0a5-40ec-8e41-5a5a18c6bff6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# memory: 18098MiB"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/peft/import_utils.py b/src/peft/import_utils.py
index 58c65f9c80..ff24e9fe31 100644
--- a/src/peft/import_utils.py
+++ b/src/peft/import_utils.py
@@ -87,3 +87,19 @@ def is_eetq_available():
 @lru_cache
 def is_hqq_available():
     return importlib.util.find_spec("hqq") is not None
+
+
+@lru_cache
+def is_torchao_available():
+    if importlib.util.find_spec("torchao") is None:
+        return False
+
+    TORCHAO_MINIMUM_VERSION = packaging.version.parse("0.4.0")
+    torchao_version = packaging.version.parse(importlib_metadata.version("torchao"))
+
+    if torchao_version < TORCHAO_MINIMUM_VERSION:
+        raise ImportError(
+            f"Found an incompatible version of torchao. Found version {torchao_version}, "
+            f"but only versions above {TORCHAO_MINIMUM_VERSION} are supported"
+        )
+    return True
diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py
index 5ef2740c44..edb1273381 100644
--- a/src/peft/tuners/lora/model.py
+++ b/src/peft/tuners/lora/model.py
@@ -53,6 +53,7 @@
 from .gptq import dispatch_gptq
 from .hqq import dispatch_hqq
 from .layer import Conv2d, LoraLayer, dispatch_default
+from .torchao import dispatch_torchao
 from .tp_layer import dispatch_megatron
 
 
@@ -202,6 +203,13 @@ def _create_and_replace(
             "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False),
             "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False),
         }
+        # for torchao merging, we need the get_apply_tensor_subclass from the quantization config
+        try:
+            kwargs["get_apply_tensor_subclass"] = operator.attrgetter(
+                "hf_quantizer.quantization_config.get_apply_tensor_subclass"
+            )(self.model)
+        except AttributeError:
+            pass
 
         quant_methods = ["gptq", "aqlm", "awq"]
         for quant_method in quant_methods:
@@ -334,6 +342,7 @@ def dynamic_dispatch_func(target, adapter_name, lora_config, **kwargs):
                 dispatch_awq,
                 dispatch_gptq,
                 dispatch_hqq,
+                dispatch_torchao,
                 dispatch_megatron,
                 dispatch_default,
             ]
diff --git a/src/peft/tuners/lora/torchao.py b/src/peft/tuners/lora/torchao.py
new file mode 100644
index 0000000000..1e0ebaeab3
--- /dev/null
+++ b/src/peft/tuners/lora/torchao.py
@@ -0,0 +1,146 @@
+# Copyright 2024-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import warnings
+from typing import Any, Optional
+
+import torch
+
+# from torch import nn
+from peft.import_utils import is_torchao_available
+from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
+
+from .config import LoraConfig
+from .layer import Linear
+
+
+class TorchaoLoraLinear(Linear):
+    """LoRA layer implementation for Linear layers using torchao data"""
+
+    def __init__(self, *args, get_apply_tensor_subclass, **kwargs):
+        # this is not strictly necessary, as kwargs are stored either way, but we want to error early if
+        # get_apply_tensor_subclass is missing.
+        super().__init__(*args, **kwargs)
+        self.get_apply_tensor_subclass = get_apply_tensor_subclass
+        self._check_dtype_supported()
+
+    def _check_dtype_supported(self):
+        # TODO: Not required once int4_weight_only is properly supported by torchao
+        base_layer = self.get_base_layer()
+        weight = base_layer.weight
+        if hasattr(weight, "layout_tensor") and (weight.layout_tensor.data.dtype != torch.int8):
+            raise ValueError(f"{type(self).__name__} only supports int8 weights for now.")
+
+    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
+        from torchao import quantize_
+
+        adapter_names = check_adapters_to_merge(self, adapter_names)
+        if not adapter_names:
+            # no adapter to merge
+            return
+
+        self._check_dtype_supported()
+
+        base_layer = self.get_base_layer()
+        weight = base_layer.weight
+
+        for active_adapter in adapter_names:
+            try:
+                weight = weight.dequantize()
+            except NotImplementedError as exc:
+                msg = (
+                    f"Weights of type {type(weight).__name__} do not support dequantization (yet), which is needed to "
+                    "support merging."
+                )
+                raise NotImplementedError(msg) from exc
+
+            if safe_merge and not torch.isfinite(weight).all():
+                raise ValueError(
+                    f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
+                )
+
+            weight += self.get_delta_weight(active_adapter)
+            # TODO: once (if) torchao supports directly mutating the data, use that instead.
+            del base_layer.weight
+            base_layer.weight = weight
+            quantize_(base_layer, self.get_apply_tensor_subclass())
+            del weight
+
+            self.merged_adapters.append(active_adapter)
+
+    def unmerge(self) -> None:
+        from torchao import quantize_
+
+        if not self.merged:
+            warnings.warn("Already unmerged. Nothing to do.")
+            return
+
+        while len(self.merged_adapters) > 0:
+            active_adapter = self.merged_adapters.pop()
+            if active_adapter not in self.lora_A.keys():
+                continue
+
+            base_layer = self.get_base_layer()
+            weight = base_layer.weight
+            try:
+                weight = weight.dequantize()
+            except NotImplementedError as exc:
+                msg = (
+                    f"Weights of type {type(weight).__name__} do not support dequantization (yet), which is needed to "
+                    "support unmerging."
+                )
+                raise NotImplementedError(msg) from exc
+
+            weight -= self.get_delta_weight(active_adapter)
+            # We go through a dummy module because overriding the weight.data does not work, the tensor retains the old
+            # data. Therefore, we need to go through quantize_, which takes a module as input, and we need to delete and
+            # re-assign the weight.
+            # TODO: once (if) torchao supports directly mutating the data, use that instead.
+            del base_layer.weight
+            base_layer.weight = weight
+            quantize_(base_layer, self.get_apply_tensor_subclass())
+            del weight
+
+    def __repr__(self) -> str:
+        rep = super().__repr__()
+        return rep.replace("lora.Linear", f"lora.{self.__class__.__name__}")
+
+
+def dispatch_torchao(
+    target: torch.nn.Module,
+    adapter_name: str,
+    lora_config: LoraConfig,
+    **kwargs: Any,
+) -> Optional[torch.nn.Module]:
+    new_module = None
+
+    if isinstance(target, BaseTunerLayer):
+        target_base_layer = target.get_base_layer()
+    else:
+        target_base_layer = target
+
+    if not hasattr(target_base_layer, "weight"):
+        return new_module
+
+    if not is_torchao_available():
+        return new_module
+
+    from torchao.dtypes import AffineQuantizedTensor
+    from torchao.quantization import LinearActivationQuantizedTensor
+
+    if isinstance(target_base_layer.weight, (AffineQuantizedTensor, LinearActivationQuantizedTensor)):
+        new_module = TorchaoLoraLinear(target, adapter_name, **kwargs)
+
+    return new_module
diff --git a/src/peft/utils/integrations.py b/src/peft/utils/integrations.py
index 4a23809317..bf9bd2aecc 100644
--- a/src/peft/utils/integrations.py
+++ b/src/peft/utils/integrations.py
@@ -52,6 +52,10 @@ def dequantize_module_weight(module: torch.nn.Module) -> torch.nn.Parameter:
     if hasattr(module, "W_q"):  # For handling HQQ quantized weight
         weight = module.dequantize()
         return weight
+    elif type(module.weight).__module__.startswith("torchao."):
+        # check for torchao without requiring any torchao imports
+        weight = module.weight.dequantize()
+        return weight
 
     weight = module.weight
     if not isinstance(weight, torch.nn.Parameter):
diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py
index c38192d8f8..b499067ae5 100644
--- a/src/peft/utils/other.py
+++ b/src/peft/utils/other.py
@@ -114,6 +114,7 @@ def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, grad
     is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq"
     is_aqlm_quantized = getattr(model, "quantization_method", None) == "aqlm"
     is_eetq_quantized = getattr(model, "quantization_method", None) == "eetq"
+    is_torchao_quantized = getattr(model, "quantization_method", None) == "torchao"
     is_hqq_quantized = getattr(model, "quantization_method", None) == "hqq" or getattr(model, "hqq_quantized", False)
 
     if gradient_checkpointing_kwargs is None:
@@ -123,7 +124,13 @@ def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, grad
         # freeze base model's layers
         param.requires_grad = False
 
-    if not is_gptq_quantized and not is_aqlm_quantized and not is_eetq_quantized and not is_hqq_quantized:
+    if (
+        not is_gptq_quantized
+        and not is_aqlm_quantized
+        and not is_eetq_quantized
+        and not is_hqq_quantized
+        and not is_torchao_quantized
+    ):
         # cast all non INT8 parameters to fp32
         for param in model.parameters():
             if (
@@ -132,7 +139,12 @@ def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, grad
                 param.data = param.data.to(torch.float32)
 
     if (
-        loaded_in_kbit or is_gptq_quantized or is_aqlm_quantized or is_eetq_quantized or is_hqq_quantized
+        loaded_in_kbit
+        or is_gptq_quantized
+        or is_aqlm_quantized
+        or is_eetq_quantized
+        or is_hqq_quantized
+        or is_torchao_quantized
     ) and use_gradient_checkpointing:
         # When having `use_reentrant=False` + gradient_checkpointing, there is no need for this hack
         if "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]:
diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py
index aa3a6e5005..2201b2d81e 100644
--- a/tests/test_gpu_examples.py
+++ b/tests/test_gpu_examples.py
@@ -1,5 +1,4 @@
-# Copyright 2023-present the HuggingFace Inc. team.
-#
+# Copyright 2023-present the HuggingFace Inc. team.#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -14,6 +13,7 @@
 import gc
 import importlib
 import os
+import re
 import tempfile
 import unittest
 from collections import Counter
@@ -79,6 +79,7 @@
     require_optimum,
     require_torch_gpu,
     require_torch_multi_gpu,
+    require_torchao,
 )
 
 
@@ -3242,6 +3243,371 @@ def test_causal_lm_training_multi_gpu_eetq(self):
             assert trainer.state.log_history[-1]["train_loss"] is not None
 
 
+@require_non_xpu
+@require_torch_gpu
+@require_torchao
+class PeftTorchaoGPUTests(unittest.TestCase):
+    r"""
+    torchao + peft tests
+    """
+
+    supported_quant_types = [
+        "int8_weight_only",
+        "int8_dynamic_activation_int8_weight",
+        # int4_weight_only raises an error:
+        # RuntimeError: derivative for aten::_weight_int4pack_mm is not implemented
+        # "int4_weight_only",
+    ]
+
+    def setUp(self):
+        self.causal_lm_model_id = "facebook/opt-125m"
+        self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)
+
+    def tearDown(self):
+        r"""
+        Efficient mechanism to free GPU memory after each test. Based on
+        https://github.com/huggingface/transformers/issues/21094
+        """
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @parameterized.expand(supported_quant_types)
+    @pytest.mark.single_gpu_tests
+    def test_causal_lm_training_single_gpu_torchao(self, quant_type):
+        from transformers import TorchAoConfig
+
+        device = 0
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            quantization_config = TorchAoConfig(quant_type=quant_type)
+            model = AutoModelForCausalLM.from_pretrained(
+                self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+            )
+            model = prepare_model_for_kbit_training(model)
+
+            config = LoraConfig(
+                r=16,
+                lora_alpha=32,
+                target_modules=["q_proj", "v_proj"],
+                lora_dropout=0.05,
+                bias="none",
+                task_type="CAUSAL_LM",
+            )
+            model = get_peft_model(model, config)
+
+            data = load_dataset("ybelkada/english_quotes_copy")
+            data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)
+
+            trainer = Trainer(
+                model=model,
+                train_dataset=data["train"],
+                args=TrainingArguments(
+                    per_device_train_batch_size=4,
+                    gradient_accumulation_steps=4,
+                    warmup_steps=2,
+                    max_steps=3,
+                    learning_rate=2e-4,
+                    logging_steps=1,
+                    output_dir=tmp_dir,
+                ),
+                data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
+            )
+            trainer.model.config.use_cache = False
+            trainer.train()
+
+            model.save_pretrained(tmp_dir)
+
+            assert "adapter_config.json" in os.listdir(tmp_dir)
+            assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)
+
+            # assert loss is not None
+            assert trainer.state.log_history[-1]["train_loss"] is not None
+
+    @pytest.mark.single_gpu_tests
+    def test_causal_lm_training_single_gpu_torchao_dora_int8_weight_only(self):
+        from transformers import TorchAoConfig
+
+        device = 0
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            quantization_config = TorchAoConfig(quant_type="int8_weight_only")
+            model = AutoModelForCausalLM.from_pretrained(
+                self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+            )
+            model = prepare_model_for_kbit_training(model)
+
+            config = LoraConfig(
+                r=16,
+                lora_alpha=32,
+                target_modules=["q_proj", "v_proj"],
+                lora_dropout=0.05,
+                bias="none",
+                task_type="CAUSAL_LM",
+                use_dora=True,
+            )
+            model = get_peft_model(model, config)
+
+            data = load_dataset("ybelkada/english_quotes_copy")
+            data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)
+
+            trainer = Trainer(
+                model=model,
+                train_dataset=data["train"],
+                args=TrainingArguments(
+                    per_device_train_batch_size=4,
+                    gradient_accumulation_steps=4,
+                    warmup_steps=2,
+                    max_steps=3,
+                    learning_rate=2e-4,
+                    logging_steps=1,
+                    output_dir=tmp_dir,
+                ),
+                data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
+            )
+            trainer.model.config.use_cache = False
+            trainer.train()
+
+            model.save_pretrained(tmp_dir)
+
+            assert "adapter_config.json" in os.listdir(tmp_dir)
+            assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)
+
+            # assert loss is not None
+            assert trainer.state.log_history[-1]["train_loss"] is not None
+
+    @pytest.mark.single_gpu_tests
+    def test_causal_lm_training_single_gpu_torchao_dora_int8_dynamic_activation_int8_weight_raises(self):
+        from transformers import TorchAoConfig
+
+        device = 0
+
+        quantization_config = TorchAoConfig(quant_type="int8_dynamic_activation_int8_weight")
+        model = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+        )
+        model = prepare_model_for_kbit_training(model)
+
+        config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+            use_dora=True,
+        )
+        with pytest.raises(NotImplementedError):
+            get_peft_model(model, config)
+
+    @pytest.mark.single_gpu_tests
+    def test_causal_lm_training_single_gpu_torchao_int4_raises(self):
+        # int4_weight_only raises an error:
+        # RuntimeError: derivative for aten::_weight_int4pack_mm is not implemented
+        # TODO: Once proper torchao support for int4 is added, remove this test and add int4 to supported_quant_types
+        from transformers import TorchAoConfig
+
+        device = 0
+
+        quantization_config = TorchAoConfig(quant_type="int4_weight_only")
+        model = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+        )
+        model = prepare_model_for_kbit_training(model)
+
+        config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+
+        msg = re.escape("TorchaoLoraLinear only supports int8 weights for now")
+        with pytest.raises(ValueError, match=msg):
+            get_peft_model(model, config)
+
+    @parameterized.expand(supported_quant_types)
+    @pytest.mark.multi_gpu_tests
+    @require_torch_multi_gpu
+    def test_causal_lm_training_multi_gpu_torchao(self, quant_type):
+        from transformers import TorchAoConfig
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            quantization_config = TorchAoConfig(quant_type=quant_type)
+            model = AutoModelForCausalLM.from_pretrained(
+                self.causal_lm_model_id,
+                device_map="auto",
+                quantization_config=quantization_config,
+                torch_dtype=torch.bfloat16,
+            )
+
+            assert set(model.hf_device_map.values()) == set(range(torch.cuda.device_count()))
+
+            model = prepare_model_for_kbit_training(model)
+            model.model_parallel = True
+            model.is_parallelizable = True
+
+            config = LoraConfig(
+                r=16,
+                lora_alpha=32,
+                target_modules=["q_proj", "v_proj"],
+                lora_dropout=0.05,
+                bias="none",
+                task_type="CAUSAL_LM",
+            )
+            model = get_peft_model(model, config)
+
+            data = load_dataset("ybelkada/english_quotes_copy")
+            data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)
+
+            trainer = Trainer(
+                model=model,
+                train_dataset=data["train"],
+                args=TrainingArguments(
+                    per_device_train_batch_size=4,
+                    gradient_accumulation_steps=4,
+                    warmup_steps=2,
+                    max_steps=3,
+                    learning_rate=2e-4,
+                    logging_steps=1,
+                    output_dir=tmp_dir,
+                ),
+                data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
+            )
+            trainer.model.config.use_cache = False
+            trainer.train()
+
+            model.save_pretrained(tmp_dir)
+
+            assert "adapter_config.json" in os.listdir(tmp_dir)
+            assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)
+
+            # assert loss is not None
+            assert trainer.state.log_history[-1]["train_loss"] is not None
+
+    @pytest.mark.multi_gpu_tests
+    @require_torch_multi_gpu
+    def test_causal_lm_training_multi_gpu_torchao_int4_raises(self):
+        # int4_weight_only raises an error:
+        # RuntimeError: derivative for aten::_weight_int4pack_mm is not implemented
+        # TODO: Once proper torchao support for int4 is added, remove this test and add int4 to supported_quant_types
+        from transformers import TorchAoConfig
+
+        quantization_config = TorchAoConfig(quant_type="int4_weight_only")
+        model = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=quantization_config,
+            torch_dtype=torch.bfloat16,
+        )
+
+        assert set(model.hf_device_map.values()) == set(range(torch.cuda.device_count()))
+
+        model = prepare_model_for_kbit_training(model)
+        model.model_parallel = True
+        model.is_parallelizable = True
+
+        config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+
+        msg = re.escape("TorchaoLoraLinear only supports int8 weights for now")
+        with pytest.raises(ValueError, match=msg):
+            get_peft_model(model, config)
+
+    @pytest.mark.single_gpu_tests
+    def test_torchao_merge_layers_int8_weight_only(self):
+        from torchao.dtypes import AffineQuantizedTensor
+        from transformers import TorchAoConfig
+
+        quant_type = "int8_weight_only"
+        torch.manual_seed(0)
+        device = 0
+        dummy_input = torch.arange(10).view(-1, 1).to(device)
+
+        quantization_config = TorchAoConfig(quant_type=quant_type)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+        ).eval()
+        logits_base = model(dummy_input)[0]
+
+        config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+            init_lora_weights=False,
+        )
+        model = get_peft_model(model, config)
+
+        model.eval()
+        logits = model(dummy_input)[0]
+
+        # sanity check: outputs changed
+        # precision is quite low, so we need to use high atol and rtol
+        atol, rtol = 1e-1, 1e-1
+        assert not torch.allclose(logits, logits_base, atol=atol, rtol=rtol)
+
+        model.merge_adapter()
+        logits_merged = model(dummy_input)[0]
+        for name, module in model.named_modules():
+            if "base_layer" in name:
+                assert isinstance(module.weight, AffineQuantizedTensor)
+
+        model.unmerge_adapter()
+        logits_unmerged = model(dummy_input)[0]
+        for name, module in model.named_modules():
+            if "base_layer" in name:
+                assert isinstance(module.weight, AffineQuantizedTensor)
+
+        model = model.merge_and_unload()
+        logits_merged_unloaded = model(dummy_input)[0]
+
+        assert torch.allclose(logits, logits_merged, atol=atol, rtol=rtol)
+        assert torch.allclose(logits, logits_unmerged, atol=atol, rtol=rtol)
+        assert torch.allclose(logits, logits_merged_unloaded, atol=atol, rtol=rtol)
+
+    @pytest.mark.single_gpu_tests
+    def test_torchao_merge_layers_int8_dynamic_activation_int8_weight_raises(self):
+        # int8_dynamic_activation_int8_weight does not support dequantize, thus merging does not work
+        from transformers import TorchAoConfig
+
+        quant_type = "int8_dynamic_activation_int8_weight"
+        torch.manual_seed(0)
+        device = 0
+
+        quantization_config = TorchAoConfig(quant_type=quant_type)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+        ).eval()
+
+        config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+            init_lora_weights=False,
+        )
+        model = get_peft_model(model, config)
+
+        msg = re.escape(
+            "Weights of type LinearActivationQuantizedTensor do not support dequantization (yet), which is needed to "
+            "support merging."
+        )
+        with pytest.raises(NotImplementedError, match=msg):
+            model.merge_adapter()
+
+
 PRECISIONS = [(torch.float32), (torch.float16), (torch.bfloat16)]
 
 LORA_PARAMS = {
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 0eeb643bc6..32bd6515ad 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -26,6 +26,7 @@
     is_eetq_available,
     is_hqq_available,
     is_optimum_available,
+    is_torchao_available,
 )
 
 
@@ -132,6 +133,13 @@ def require_optimum(test_case):
     return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case)
 
 
+def require_torchao(test_case):
+    """
+    Decorator marking a test that requires torchao. These tests are skipped when torchao isn't installed.
+    """
+    return unittest.skipUnless(is_torchao_available(), "test requires torchao")(test_case)
+
+
 @contextmanager
 def temp_seed(seed: int):
     """Temporarily set the random seed. This works for python numpy, pytorch."""

From a724834ac43b9478b066d3ec8b421489151f3815 Mon Sep 17 00:00:00 2001
From: suyang160 <2515255687@qq.com>
Date: Tue, 8 Oct 2024 17:44:22 +0100
Subject: [PATCH 14/22] FIX: PiSSA now works with Conv1D layers (#2103) (#2104)

Transpose weight matrix based on fan_in_fan_out condition in PiSSA
initialization.

Co-authored-by: Yang Su <suyang360@gmail.com>
---
 src/peft/tuners/lora/layer.py |  4 ++--
 tests/test_gpu_examples.py    | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py
index 89f0d0406d..f3359ca9a8 100644
--- a/src/peft/tuners/lora/layer.py
+++ b/src/peft/tuners/lora/layer.py
@@ -221,7 +221,7 @@ def pissa_init(self, adapter_name, init_lora_weights):
                 "Please initialize PiSSA under float32, float16, or bfloat16. "
                 "Subsequently, re-quantize the residual model to help minimize quantization errors."
             )
-        weight = weight.to(torch.float32)
+        weight = transpose(weight.to(torch.float32), self.fan_in_fan_out)
         if init_lora_weights == "pissa":
             # USV^T = W <-> VSU^T = W^T, where W^T = weight.data in R^{out_channel, in_channel},
             V, S, Uh = torch.linalg.svd(weight.data, full_matrices=False)
@@ -245,7 +245,7 @@ def pissa_init(self, adapter_name, init_lora_weights):
         self.lora_A[adapter_name].weight.data = lora_A
         self.lora_B[adapter_name].weight.data = lora_B
         weight = weight.data - self.scaling[adapter_name] * lora_B @ lora_A
-        weight = weight.to(dtype)
+        weight = transpose(weight.to(dtype), self.fan_in_fan_out)
         self.get_base_layer().weight.data = weight
 
     def loftq_init(self, adapter_name):
diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py
index 2201b2d81e..afa6a15d9c 100644
--- a/tests/test_gpu_examples.py
+++ b/tests/test_gpu_examples.py
@@ -46,6 +46,7 @@
     WhisperProcessor,
     WhisperTokenizer,
 )
+from transformers.pytorch_utils import Conv1D
 
 from peft import (
     AdaLoraConfig,
@@ -1719,7 +1720,7 @@ def quantize_model(self, model, num_bits=4, device="cuda"):
         # Quantize the `weight.data` of the linear layer in the model to `num_bits` and store it with full precision.
         quantizer = NFQuantizer(num_bits=num_bits, device=device, method="normal", block_size=64)
         for name, module in model.named_modules():
-            if isinstance(module, torch.nn.Linear) and "lm_head" not in name:
+            if isinstance(module, (torch.nn.Linear, Conv1D)) and "lm_head" not in name:
                 quantized_weight, max_abs, shape = quantizer.quantize_block(module.weight.data.to(device))
                 module.weight.data = quantizer.dequantize_block(quantized_weight, max_abs, shape)
         return model
@@ -1728,7 +1729,7 @@ def nuclear_norm(self, base_model, quantized_model):
         # Calculate the nuclear norm (sum of singular values) of the error matrices between the `quantized_model` and the `base_model`.
         error_list = []
         for name, module in base_model.named_modules():
-            if isinstance(module, torch.nn.Linear) and "lm_head" not in name:
+            if isinstance(module, (torch.nn.Linear, Conv1D)) and "lm_head" not in name:
                 quant_module = quantized_model.get_submodule(name)
                 error_list.append(torch.linalg.svdvals(module.weight.data - quant_module.weight.data).sum())
         return torch.Tensor(error_list).sum()
@@ -1822,6 +1823,16 @@ def test_t5_pissa_4bit(self, device, tmp_path):
     def test_t5_pissa_8bit(self, device, tmp_path):
         self.get_errors(bits=8, device=device, model_id="t5-small", tmp_path=tmp_path)
 
+    @pytest.mark.parametrize("device", ["cuda", "cpu"])
+    def test_gpt2_pissa_4bit(self, device, tmp_path):
+        # see 2104
+        self.get_errors(bits=4, device=device, model_id="gpt2", tmp_path=tmp_path)
+
+    @pytest.mark.parametrize("device", ["cuda", "cpu"])
+    def test_gpt2_pissa_8bit(self, device, tmp_path):
+        # see 2104
+        self.get_errors(bits=8, device=device, model_id="gpt2", tmp_path=tmp_path)
+
     @require_bitsandbytes
     def test_lora_pissa_conversion_same_output_after_loading_with_quantization(self, tmp_path):
         # A copy of the test `test_lora_pissa_conversion_same_output_after_loading` in peft/tests/test_initialization.py,

From 3b314cc98b43aff65c265c1c0bb9f9ab4d01bea1 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Wed, 9 Oct 2024 12:16:46 +0200
Subject: [PATCH 15/22] FIX Type annoations in vera/bnb.py (#2139)

The file was missing the from __future__ import annotations part. As
this code is only running nightly with GPU, the normal CI missed this
omission.
---
 src/peft/tuners/vera/bnb.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/peft/tuners/vera/bnb.py b/src/peft/tuners/vera/bnb.py
index 272560d697..da5d37a039 100644
--- a/src/peft/tuners/vera/bnb.py
+++ b/src/peft/tuners/vera/bnb.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 import warnings
 from typing import Optional

From 85e3202a000de08a1c1523b41714fbd300fa5dc7 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Wed, 9 Oct 2024 12:37:49 +0200
Subject: [PATCH 16/22] ENH Make PEFT configs forward compatible (#2038)

Right now, loading a PEFT config saved with a more recent PEFT version
than is currently installed will lead to errors when new arguments are
added to the config in the newer PEFT version. The current workaround is
for users to manually edit the adapter_config.json to remove those
entries.

With this PR, PEFT will make an attempt at removing these unknown keys
by inspecting the signature. The user will be warned about these removed
keys. This should generally be a safe measure because we will generally
not introduce new config settings that change the default behavior.
However, if a non-default is used, this could lead to wrong results.
This is mentioned in the warning.

While working on the tests, I also converted the unittest.TestCase to a
normal pytest test in order to be able to use pytest fixtures.

I also plan on adding the PEFT version to the adapter_config.json in the
future. This will allow us to better handle compatibility issues in the
future. As adding that new key to all PEFT configs could cause a lot of
disruption, I want to get this PR in first to ensure forward
compatibility.

Note that this new mechanism will not help anyone using a PEFT version
< 0.14.0, so this will be a slow transition.
---
 src/peft/config.py   | 45 ++++++++++++++++++++++++++-
 tests/test_config.py | 73 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 101 insertions(+), 17 deletions(-)

diff --git a/src/peft/config.py b/src/peft/config.py
index 7c2ad02fe4..124e6075ac 100644
--- a/src/peft/config.py
+++ b/src/peft/config.py
@@ -24,6 +24,25 @@
 from .utils import CONFIG_NAME, PeftType, TaskType
 
 
+# we expect at least these keys to be present in a PEFT adapter_config.json
+MIN_EXPECTED_CONFIG_KEYS = {"peft_type"}
+
+
+def _check_and_remove_unused_kwargs(cls, kwargs):
+    """Make PEFT configs forward-compatible by removing unused kwargs that were added in later PEFT versions.
+
+    This assumes that removing the unused kwargs will not affect the default behavior.
+
+    Returns the filtered kwargs and the set of removed keys.
+    """
+    # it's not pretty but eh
+    signature_parameters = inspect.signature(cls.__init__).parameters
+    unexpected_kwargs = set(kwargs.keys()) - set(signature_parameters.keys())
+    for key in unexpected_kwargs:
+        del kwargs[key]
+    return kwargs, unexpected_kwargs
+
+
 @dataclass
 class PeftConfigMixin(PushToHubMixin):
     r"""
@@ -116,7 +135,31 @@ def from_peft_type(cls, **kwargs):
         else:
             config_cls = cls
 
-        return config_cls(**kwargs)
+        try:
+            config = config_cls(**kwargs)
+        except TypeError as exc:
+            # Here we potentially handle forward compatibility. Sometimes new keywords are added to configs, which makes
+            # new configs incompatible with older PEFT versions. We catch these and remove them to allow the program to
+            # continue, but warn the user about it.
+
+            # First check if the error is due to unexpected keyword arguments, we don't want to accidentally catch
+            # other TypeErrors.
+            if "got an unexpected keyword argument" not in str(exc):
+                raise exc
+
+            filtered_kwargs, unexpected_kwargs = _check_and_remove_unused_kwargs(cls, kwargs)
+            if not MIN_EXPECTED_CONFIG_KEYS.issubset(set(filtered_kwargs.keys())):
+                raise TypeError(f"The config that is trying to be loaded is not a valid {cls.__name__} config.")
+
+            warnings.warn(
+                f"Unexpected keyword arguments {sorted(unexpected_kwargs)} for class {cls.__name__}, these are "
+                "ignored. This probably means that you're loading a configuration file that was saved using a "
+                "higher version of the library and additional parameters have been introduced since. It is "
+                "highly recommended to upgrade the PEFT version before continuing (e.g. by running `pip install "
+                "-U peft`)."
+            )
+            config = config_cls.from_peft_type(**filtered_kwargs)
+        return config
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional[str] = None, **kwargs):
diff --git a/tests/test_config.py b/tests/test_config.py
index ac76eade88..f02c28d197 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -16,11 +16,9 @@
 import os
 import pickle
 import tempfile
-import unittest
 import warnings
 
 import pytest
-from parameterized import parameterized
 
 from peft import (
     AdaLoraConfig,
@@ -34,6 +32,7 @@
     LoKrConfig,
     LoraConfig,
     MultitaskPromptTuningConfig,
+    OFTConfig,
     PeftConfig,
     PeftType,
     PolyConfig,
@@ -69,8 +68,8 @@
 )
 
 
-class PeftConfigTester(unittest.TestCase):
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+class TestPeftConfig:
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_methods(self, config_class):
         r"""
         Test if all configs have the expected methods. Here we test
@@ -86,7 +85,7 @@ def test_methods(self, config_class):
         assert hasattr(config, "from_pretrained")
         assert hasattr(config, "from_json_file")
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_task_type(self, config_class):
         config_class(task_type="test")
 
@@ -102,7 +101,7 @@ def test_from_peft_type(self):
             config = PeftConfig.from_peft_type(peft_type=peft_type)
             assert type(config) is expected_cls
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_from_pretrained(self, config_class):
         r"""
         Test if the config is correctly loaded using:
@@ -112,7 +111,7 @@ def test_from_pretrained(self, config_class):
             # Test we can load config from delta
             config_class.from_pretrained(model_name, revision=revision)
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_save_pretrained(self, config_class):
         r"""
         Test if the config is correctly saved and loaded using
@@ -125,7 +124,7 @@ def test_save_pretrained(self, config_class):
             config_from_pretrained = config_class.from_pretrained(tmp_dirname)
             assert config.to_dict() == config_from_pretrained.to_dict()
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_from_json_file(self, config_class):
         config = config_class()
         with tempfile.TemporaryDirectory() as tmp_dirname:
@@ -143,7 +142,7 @@ def test_from_json_file(self, config_class):
             config_from_json = config_class.from_json_file(config_path)
             assert config.to_dict() == config_from_json
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_to_dict(self, config_class):
         r"""
         Test if the config can be correctly converted to a dict using:
@@ -152,7 +151,7 @@ def test_to_dict(self, config_class):
         config = config_class()
         assert isinstance(config.to_dict(), dict)
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_from_pretrained_cache_dir(self, config_class):
         r"""
         Test if the config is correctly loaded with extra kwargs
@@ -170,7 +169,7 @@ def test_from_pretrained_cache_dir_remote(self):
             PeftConfig.from_pretrained("ybelkada/test-st-lora", cache_dir=tmp_dirname)
             assert "models--ybelkada--test-st-lora" in os.listdir(tmp_dirname)
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_save_pretrained_with_runtime_config(self, config_class):
         r"""
         Test if the config correctly removes runtime config when saving
@@ -185,7 +184,7 @@ def test_save_pretrained_with_runtime_config(self, config_class):
                 cfg = config_class.from_pretrained(tmp_dirname)
                 assert not cfg.runtime_config.ephemeral_gpu_offload
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_set_attributes(self, config_class):
         # manually set attributes and check if they are correctly written
         config = config_class(peft_type="test")
@@ -197,21 +196,21 @@ def test_set_attributes(self, config_class):
             config_from_pretrained = config_class.from_pretrained(tmp_dirname)
             assert config.to_dict() == config_from_pretrained.to_dict()
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_config_copy(self, config_class):
         # see https://github.com/huggingface/peft/issues/424
         config = config_class()
         copied = copy.copy(config)
         assert config.to_dict() == copied.to_dict()
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_config_deepcopy(self, config_class):
         # see https://github.com/huggingface/peft/issues/424
         config = config_class()
         copied = copy.deepcopy(config)
         assert config.to_dict() == copied.to_dict()
 
-    @parameterized.expand(ALL_CONFIG_CLASSES)
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
     def test_config_pickle_roundtrip(self, config_class):
         # see https://github.com/huggingface/peft/issues/424
         config = config_class()
@@ -240,7 +239,9 @@ def test_prompt_encoder_warning_num_layers(self):
         expected_msg = "for MLP, the argument `encoder_num_layers` is ignored. Exactly 2 MLP layers are used."
         assert str(record.list[0].message) == expected_msg
 
-    @parameterized.expand([LoHaConfig, LoraConfig, IA3Config, BOFTConfig, HRAConfig, VBLoRAConfig])
+    @pytest.mark.parametrize(
+        "config_class", [LoHaConfig, LoraConfig, IA3Config, OFTConfig, BOFTConfig, HRAConfig, VBLoRAConfig]
+    )
     def test_save_pretrained_with_target_modules(self, config_class):
         # See #1041, #1045
         config = config_class(target_modules=["a", "list"])
@@ -310,3 +311,43 @@ def test_adalora_config_r_warning(self):
         # Test that a warning is raised when r != 8 in AdaLoraConfig
         with pytest.warns(UserWarning, match="Note that `r` is not used in AdaLora and will be ignored."):
             AdaLoraConfig(r=10)
+
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
+    def test_from_pretrained_forward_compatible(self, config_class, tmp_path, recwarn):
+        """
+        Make it possible to load configs that contain unknown keys by ignoring them.
+
+        The idea is to make PEFT configs forward-compatible with future versions of the library.
+        """
+        config = config_class()
+        config.save_pretrained(tmp_path)
+        # add a spurious key to the config
+        with open(tmp_path / "adapter_config.json") as f:
+            config_dict = json.load(f)
+        config_dict["foobar"] = "baz"
+        config_dict["spam"] = 123
+        with open(tmp_path / "adapter_config.json", "w") as f:
+            json.dump(config_dict, f)
+
+        msg = f"Unexpected keyword arguments ['foobar', 'spam'] for class {config_class.__name__}, these are ignored."
+        config_from_pretrained = config_class.from_pretrained(tmp_path)
+
+        assert len(recwarn) == 1
+        assert recwarn.list[0].message.args[0].startswith(msg)
+        assert "foo" not in config_from_pretrained.to_dict()
+        assert "spam" not in config_from_pretrained.to_dict()
+        assert config.to_dict() == config_from_pretrained.to_dict()
+        assert isinstance(config_from_pretrained, config_class)
+
+    @pytest.mark.parametrize("config_class", ALL_CONFIG_CLASSES)
+    def test_from_pretrained_sanity_check(self, config_class, tmp_path):
+        """Following up on the previous test about forward compatibility, we *don't* want any random json to be accepted as
+        a PEFT config. There should be a minimum set of required keys.
+        """
+        non_peft_json = {"foo": "bar", "baz": 123}
+        with open(tmp_path / "adapter_config.json", "w") as f:
+            json.dump(non_peft_json, f)
+
+        msg = f"The config that is trying to be loaded is not a valid {config_class.__name__} config"
+        with pytest.raises(TypeError, match=msg):
+            config_class.from_pretrained(tmp_path)

From 8efa0cb7355a933719c9ce5a76c0a262aaff18ed Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Wed, 9 Oct 2024 15:53:28 +0200
Subject: [PATCH 17/22] FIX Raise mixed adapter infer with missing adapter
 (#2090)

PEFT allows mixed batch adapter inference, i.e. when predicting, the
same batch can use different adapters by passing the adapter_names
argument. However, when users pass an adapter name that does not
correspond to any of the existing adapters, these samples are currently
being ignored (i.e. just the base model output is used). This is
unexpected and can easily lead to errors, e.g. when users mistype the
name of an adapter.

This PR fixes this issue by checking all the existing adapter names
first and comparing them to the adapter_names that the user passed. If
there are unexpected entries, an error is raised.

Due to this fix, an error in the test
test_mixed_adapter_batches_lora_merged_raises was discovered and
promptly fixed.
---
 src/peft/tuners/lora/model.py | 13 +++++++++++++
 tests/test_custom_models.py   | 29 ++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py
index edb1273381..04294735d3 100644
--- a/src/peft/tuners/lora/model.py
+++ b/src/peft/tuners/lora/model.py
@@ -444,6 +444,19 @@ def _enable_peft_forward_hooks(self, *args, **kwargs):
         if self.training:
             raise ValueError("Cannot pass `adapter_names` when the model is in training mode.")
 
+        # Check that users only passed actually existing adapters.
+        # Note: We cannot do this on the layer level, as each individual layer may not have each adapter. Still, we want
+        # to check that there is at least one layer with the given name, or else something like typos can easily slip.
+        expected_adapters = set()
+        for layer in self.modules():
+            if isinstance(layer, LoraLayer):
+                expected_adapters |= layer.lora_A.keys()
+                expected_adapters |= layer.lora_embedding_A.keys()
+        unique_adapters = {name for name in adapter_names if name != "__base__"}
+        unexpected_adapters = unique_adapters - expected_adapters
+        if unexpected_adapters:
+            raise ValueError(f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}")
+
         hook_handles = []
         for module in self.modules():
             if isinstance(module, LoraLayer) or isinstance(module, ModulesToSaveWrapper):
diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py
index aa747ad245..611b07bf97 100644
--- a/tests/test_custom_models.py
+++ b/tests/test_custom_models.py
@@ -3536,13 +3536,40 @@ def test_mixed_adapter_batches_lora_merged_raises(self, mlp_lora):
         # When there are merged adapters, passing adapter names should raise an error
         inputs = {
             "X": torch.arange(90).view(-1, 10).to(self.torch_device),
-            "adapter_names": ["default"] * 9,
+            "adapter_names": ["adapter0"] * 9,
         }
         mlp_lora.merge_adapter(["adapter0"])
         msg = r"Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first."
         with pytest.raises(ValueError, match=msg):
             mlp_lora.forward(**inputs)
 
+    def test_mixed_adapter_batches_lora_wrong_adapter_name_raises(self):
+        # Ensure that all of the adapter names that are being passed actually exist
+        torch.manual_seed(0)
+        x = torch.arange(90).view(-1, 10).to(self.torch_device)
+
+        base_model = MLP().to(self.torch_device).eval()
+        config = LoraConfig(target_modules=["lin0"], init_lora_weights=False)
+        peft_model = get_peft_model(base_model, config).eval()
+        peft_model.add_adapter(adapter_name="other", peft_config=config)
+
+        # sanity check: this works
+        peft_model.forward(x, adapter_names=["default"] * 5 + ["other"] * 4)
+
+        # check one correct and one incorrect adapter
+        msg = re.escape("Trying to infer with non-existing adapter(s): does-not-exist")
+        with pytest.raises(ValueError, match=msg):
+            peft_model.forward(x, adapter_names=["default"] * 5 + ["does-not-exist"] * 4)
+
+        # check two correct adapters and one incorrect adapter
+        with pytest.raises(ValueError, match=msg):
+            peft_model.forward(x, adapter_names=["default"] * 3 + ["does-not-exist"] * 4 + ["other"] * 2)
+
+        # check only incorrect adapters
+        msg = re.escape("Trying to infer with non-existing adapter(s): does-not-exist, other-does-not-exist")
+        with pytest.raises(ValueError, match=msg):
+            peft_model.forward(x, adapter_names=["does-not-exist"] * 5 + ["other-does-not-exist"] * 4)
+
     def test_mixed_adapter_batches_lora_with_dora_raises(self):
         # When there are DoRA adapters, passing adapter names should raise an error
         torch.manual_seed(0)

From 1eab9bd10f5099305ac08d9bf30dec3e753428c6 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Wed, 9 Oct 2024 17:21:03 +0200
Subject: [PATCH 18/22] FIX Prompt learning with latest transformers error
 (#2140)

The error in PEFT is occurring after this transformers change:

https://github.com/huggingface/transformers/pull/33870

Now, in our tests, some model_kwargs no longer necessarily contain
past_key_values, resulting in a KeyError. We now account for this
possibility. Affected models were opt and gpt2.
---
 src/peft/peft_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py
index 26c4cf1fdb..7f41436045 100644
--- a/src/peft/peft_model.py
+++ b/src/peft/peft_model.py
@@ -1756,7 +1756,7 @@ def prepare_inputs_for_generation(self, *args, task_ids: Optional[torch.Tensor]
         if peft_config.peft_type == PeftType.POLY:
             model_kwargs["task_ids"] = task_ids
         if peft_config.is_prompt_learning:
-            if uses_cache and (model_kwargs["past_key_values"] is not None):
+            if uses_cache and (model_kwargs.get("past_key_values", None) is not None):
                 # change in the logic of `prepare_inputs_for_generation` makes the below code necessary
                 # In prompt learning methods, past key values are longer when compared to the `input_ids`.
                 # As such only consider the last input ids in the autogressive generation phase.
@@ -1786,7 +1786,7 @@ def prepare_inputs_for_generation(self, *args, task_ids: Optional[torch.Tensor]
                 kwargs["token_type_ids"] = None
 
             # no past_key_values or past_key_values empty cache
-            requires_prompt_injection = (model_kwargs["past_key_values"] is None) or (
+            requires_prompt_injection = (model_kwargs.get("past_key_values", None) is None) or (
                 isinstance(model_kwargs["past_key_values"], transformers.Cache)
                 and not model_kwargs["past_key_values"].get_seq_length()
             )

From 5758a7eb1c156b2c513930af666acb2d4a2cb3c3 Mon Sep 17 00:00:00 2001
From: JINO ROHIT <find.jinorohit@gmail.com>
Date: Thu, 10 Oct 2024 14:34:16 +0530
Subject: [PATCH 19/22] ENH LoRA notebook for NER task (#2126)

---
 .../token_classification/peft_lora_ner.ipynb  | 780 ++++++++++++++++++
 1 file changed, 780 insertions(+)
 create mode 100644 examples/token_classification/peft_lora_ner.ipynb

diff --git a/examples/token_classification/peft_lora_ner.ipynb b/examples/token_classification/peft_lora_ner.ipynb
new file mode 100644
index 0000000000..fae9b94b4e
--- /dev/null
+++ b/examples/token_classification/peft_lora_ner.ipynb
@@ -0,0 +1,780 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Named Entity Recognition with Peft Model 🤗\n",
+    "\n",
+    "##### In this notebook, we will learn how to perform Named Entity Recognition(NER) on the CoNLL-2003 dataset using the Trainer class\n",
+    "\n",
+    "##### This notebook has been adapted from the main NLP course here - https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt#fine-tuning-the-model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#install the required libraries\n",
+    "!pip install -q datasets evaluate transformers seqeval"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import required libraries\n",
+    "from datasets import load_dataset\n",
+    "from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline\n",
+    "from peft import get_peft_model, LoraConfig, TaskType\n",
+    "import evaluate\n",
+    "import numpy as np\n",
+    "from huggingface_hub import notebook_login"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DatasetDict({\n",
+      "    train: Dataset({\n",
+      "        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
+      "        num_rows: 14041\n",
+      "    })\n",
+      "    validation: Dataset({\n",
+      "        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
+      "        num_rows: 3250\n",
+      "    })\n",
+      "    test: Dataset({\n",
+      "        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
+      "        num_rows: 3453\n",
+      "    })\n",
+      "})\n"
+     ]
+    }
+   ],
+   "source": [
+    "raw_datasets = load_dataset(\"conll2003\")\n",
+    "print(raw_datasets)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Look at the tokens of the first training example\n",
+    "raw_datasets[\"train\"][0][\"tokens\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[3, 0, 7, 0, 0, 0, 7, 0, 0]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Look at the NER tags of the first training example\n",
+    "raw_datasets[\"train\"][0][\"ner_tags\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Get the label names for the NER tags\n",
+    "ner_feature = raw_datasets[\"train\"].features[\"ner_tags\"]\n",
+    "label_names = ner_feature.feature.names\n",
+    "label_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "EU    rejects German call to boycott British lamb . \n",
+      "B-ORG O       B-MISC O    O  O       B-MISC  O    O \n"
+     ]
+    }
+   ],
+   "source": [
+    "words = raw_datasets[\"train\"][0][\"tokens\"]\n",
+    "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n",
+    "line1 = \"\"\n",
+    "line2 = \"\"\n",
+    "for word, label in zip(words, labels):\n",
+    "    full_label = label_names[label]\n",
+    "    max_length = max(len(word), len(full_label))\n",
+    "    line1 += word + \" \" * (max_length - len(word) + 1)\n",
+    "    line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n",
+    "\n",
+    "print(line1)\n",
+    "print(line2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "e:\\open_source\\peft-folder\\ner-examples\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the tokenizer\n",
+    "model_checkpoint = \"bert-base-cased\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['[CLS]',\n",
+       " 'EU',\n",
+       " 'rejects',\n",
+       " 'German',\n",
+       " 'call',\n",
+       " 'to',\n",
+       " 'boycott',\n",
+       " 'British',\n",
+       " 'la',\n",
+       " '##mb',\n",
+       " '.',\n",
+       " '[SEP]']"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Tokenize the first training example\n",
+    "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n",
+    "inputs.tokens()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def align_labels_with_tokens(labels, word_ids):\n",
+    "    new_labels = []\n",
+    "    current_word = None\n",
+    "    for word_id in word_ids:\n",
+    "        if word_id != current_word:\n",
+    "            # Start of a new word!\n",
+    "            current_word = word_id\n",
+    "            label = -100 if word_id is None else labels[word_id]\n",
+    "            new_labels.append(label)\n",
+    "        elif word_id is None:\n",
+    "            # Special token\n",
+    "            new_labels.append(-100)\n",
+    "        else:\n",
+    "            # Same word as previous token\n",
+    "            label = labels[word_id]\n",
+    "            # If the label is B-XXX we change it to I-XXX\n",
+    "            if label % 2 == 1:\n",
+    "                label += 1\n",
+    "            new_labels.append(label)\n",
+    "\n",
+    "    return new_labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[3, 0, 7, 0, 0, 0, 7, 0, 0]\n",
+      "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n"
+     ]
+    }
+   ],
+   "source": [
+    "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n",
+    "word_ids = inputs.word_ids()\n",
+    "print(labels)\n",
+    "print(align_labels_with_tokens(labels, word_ids))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_and_align_labels(examples):\n",
+    "    tokenized_inputs = tokenizer(\n",
+    "        examples[\"tokens\"], truncation=True, is_split_into_words=True\n",
+    "    )\n",
+    "    all_labels = examples[\"ner_tags\"]\n",
+    "    new_labels = []\n",
+    "    for i, labels in enumerate(all_labels):\n",
+    "        word_ids = tokenized_inputs.word_ids(i)\n",
+    "        new_labels.append(align_labels_with_tokens(labels, word_ids))\n",
+    "\n",
+    "    tokenized_inputs[\"labels\"] = new_labels\n",
+    "    return tokenized_inputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenized_datasets = raw_datasets.map(\n",
+    "    tokenize_and_align_labels,\n",
+    "    batched=True,\n",
+    "    remove_columns=raw_datasets[\"train\"].column_names,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n",
+      "[-100, 1, 2, -100]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in range(2):\n",
+    "    print(tokenized_datasets[\"train\"][i][\"labels\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metric = evaluate.load(\"seqeval\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create label mappings\n",
+    "id2label = {i: label for i, label in enumerate(label_names)}\n",
+    "label2id = {v: k for k, v in id2label.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the pre-trained model\n",
+    "model = AutoModelForTokenClassification.from_pretrained(\n",
+    "    model_checkpoint,\n",
+    "    id2label=id2label,\n",
+    "    label2id=label2id,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "9"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.config.num_labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "BertForTokenClassification(\n",
+       "  (bert): BertModel(\n",
+       "    (embeddings): BertEmbeddings(\n",
+       "      (word_embeddings): Embedding(28996, 768, padding_idx=0)\n",
+       "      (position_embeddings): Embedding(512, 768)\n",
+       "      (token_type_embeddings): Embedding(2, 768)\n",
+       "      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "      (dropout): Dropout(p=0.1, inplace=False)\n",
+       "    )\n",
+       "    (encoder): BertEncoder(\n",
+       "      (layer): ModuleList(\n",
+       "        (0-11): 12 x BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSdpaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "  )\n",
+       "  (dropout): Dropout(p=0.1, inplace=False)\n",
+       "  (classifier): Linear(in_features=768, out_features=9, bias=True)\n",
+       ")"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "trainable params: 301,833 || all params: 108,028,434 || trainable%: 0.2794\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Configure LoRA (Low-Rank Adaptation) for fine-tuning\n",
+    "peft_config = LoraConfig(target_modules = [\"query\", \"key\"], task_type = TaskType.TOKEN_CLS)\n",
+    "\n",
+    "model = get_peft_model(model, peft_config)\n",
+    "model.print_trainable_parameters()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_metrics(eval_preds):\n",
+    "    logits, labels = eval_preds\n",
+    "    predictions = np.argmax(logits, axis=-1)\n",
+    "\n",
+    "    # Remove ignored index (special tokens) and convert to labels\n",
+    "    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]\n",
+    "    true_predictions = [\n",
+    "        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n",
+    "        for prediction, label in zip(predictions, labels)\n",
+    "    ]\n",
+    "    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)\n",
+    "    return {\n",
+    "        \"precision\": all_metrics[\"overall_precision\"],\n",
+    "        \"recall\": all_metrics[\"overall_recall\"],\n",
+    "        \"f1\": all_metrics[\"overall_f1\"],\n",
+    "        \"accuracy\": all_metrics[\"overall_accuracy\"],\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "60bd54dd23de4822891a157430ff47b9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "e:\\open_source\\peft-folder\\ner-examples\\.venv\\Lib\\site-packages\\transformers\\training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "args = TrainingArguments(\n",
+    "    \"bert-finetuned-ner-lora\",\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    per_device_train_batch_size=32, # decrease this for OOM error\n",
+    "    per_device_eval_batch_size=64,\n",
+    "    save_strategy=\"epoch\",\n",
+    "    learning_rate=2e-3,\n",
+    "    num_train_epochs=5,\n",
+    "    weight_decay=0.01,\n",
+    "    load_best_model_at_end=True,\n",
+    "    do_eval=True,\n",
+    "    do_predict=True,\n",
+    "    metric_for_best_model=\"accuracy\",\n",
+    "    label_names=[\"labels\"],\n",
+    "    push_to_hub=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "52a6470e20474df6808fcea70c8533ff",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/2195 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "e:\\open_source\\peft-folder\\ner-examples\\.venv\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:440: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
+      "  attn_output = torch.nn.functional.scaled_dot_product_attention(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7944af882c2543238c82a6e9fd65209e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/51 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'eval_loss': 0.09396398812532425, 'eval_precision': 0.8329652996845426, 'eval_recall': 0.8887579939414338, 'eval_f1': 0.859957661618629, 'eval_accuracy': 0.9724936716312474, 'eval_runtime': 5.7905, 'eval_samples_per_second': 561.264, 'eval_steps_per_second': 8.808, 'epoch': 1.0}\n",
+      "{'loss': 0.1864, 'grad_norm': 0.36243128776550293, 'learning_rate': 0.0015444191343963554, 'epoch': 1.14}\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "79c0a83640aa44a1b7c1fb61de11f618",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/51 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'eval_loss': 0.07848585397005081, 'eval_precision': 0.8617173809144496, 'eval_recall': 0.9102995624368899, 'eval_f1': 0.8853424993862018, 'eval_accuracy': 0.9782333549184671, 'eval_runtime': 4.8142, 'eval_samples_per_second': 675.093, 'eval_steps_per_second': 10.594, 'epoch': 2.0}\n",
+      "{'loss': 0.0867, 'grad_norm': 0.23459061980247498, 'learning_rate': 0.0010888382687927107, 'epoch': 2.28}\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7af499f0415a41e59ba46c078f837fa0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/51 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'eval_loss': 0.07186892628669739, 'eval_precision': 0.8772632590930941, 'eval_recall': 0.9214069336923595, 'eval_f1': 0.8987934006402364, 'eval_accuracy': 0.9796903514452228, 'eval_runtime': 4.8141, 'eval_samples_per_second': 675.103, 'eval_steps_per_second': 10.594, 'epoch': 3.0}\n",
+      "{'loss': 0.066, 'grad_norm': 0.2663992941379547, 'learning_rate': 0.0006332574031890661, 'epoch': 3.42}\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "40d883200bb3473b8dd911177685965f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/51 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'eval_loss': 0.06592053174972534, 'eval_precision': 0.8809218950064021, 'eval_recall': 0.9262874453046113, 'eval_f1': 0.9030352748154225, 'eval_accuracy': 0.9809854594690057, 'eval_runtime': 4.8969, 'eval_samples_per_second': 663.691, 'eval_steps_per_second': 10.415, 'epoch': 4.0}\n",
+      "{'loss': 0.0505, 'grad_norm': 0.1911543756723404, 'learning_rate': 0.0001776765375854214, 'epoch': 4.56}\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0eaa7fa8947f46a3af1e21d12bf730f8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/51 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'eval_loss': 0.06525895744562149, 'eval_precision': 0.8831439090763566, 'eval_recall': 0.928475260854931, 'eval_f1': 0.9052424317007138, 'eval_accuracy': 0.981515276387826, 'eval_runtime': 4.8357, 'eval_samples_per_second': 672.083, 'eval_steps_per_second': 10.547, 'epoch': 5.0}\n",
+      "{'train_runtime': 213.3684, 'train_samples_per_second': 329.032, 'train_steps_per_second': 10.287, 'train_loss': 0.09303037256749182, 'epoch': 5.0}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No files have been modified since last commit. Skipping to prevent empty commit.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "TrainOutput(global_step=2195, training_loss=0.09303037256749182, metrics={'train_runtime': 213.3684, 'train_samples_per_second': 329.032, 'train_steps_per_second': 10.287, 'total_flos': 1949278889622816.0, 'train_loss': 0.09303037256749182, 'epoch': 5.0})"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=args,\n",
+    "    train_dataset=tokenized_datasets[\"train\"],\n",
+    "    eval_dataset=tokenized_datasets[\"validation\"],\n",
+    "    data_collator=data_collator,\n",
+    "    tokenizer=tokenizer,\n",
+    "    compute_metrics=compute_metrics\n",
+    ")\n",
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[{'entity_group': 'PER',\n",
+       "  'score': np.float32(0.9957055),\n",
+       "  'word': 'Jino',\n",
+       "  'start': 11,\n",
+       "  'end': 15}]"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Replace this with your own checkpoint\n",
+    "model_checkpoint = \"bert-finetuned-ner-lora\"\n",
+    "token_classifier = pipeline(\n",
+    "    \"token-classification\", model = model_checkpoint, aggregation_strategy = \"simple\"\n",
+    ")\n",
+    "token_classifier(\"My name is Jino.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 0aa7e3a221732b76b64a8d4124092e719cc7649d Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Thu, 10 Oct 2024 14:40:54 +0200
Subject: [PATCH 20/22] FIX TST NaN issue with HQQ GPU test (#2143)

This test calculates the correlation coefficient of HQQ model outputs.
Although the model outputs are finite, the resulting matrix contains
NaNs. Casting the outputs from 16 to 32 bit precision resolves the
issue.
---
 tests/test_gpu_examples.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py
index afa6a15d9c..0cba89d120 100644
--- a/tests/test_gpu_examples.py
+++ b/tests/test_gpu_examples.py
@@ -2917,18 +2917,18 @@ def test_hqq_lora_model_outputs(self):
             output_hqq = model(**inputs).logits
 
         # check that outputs of HQQ are highly correlated; there are outliers, so don't check for equality
-        cc_matrix = torch.corrcoef(torch.stack((output_normal.flatten(), output_hqq.flatten())))
+        cc_matrix = torch.corrcoef(torch.stack((output_normal.float().flatten(), output_hqq.float().flatten())))
         assert cc_matrix.min() > 0.97
 
         # check that outputs are the same after merging
-        cc_matrix = torch.corrcoef(torch.stack((output_normal.flatten(), output_hqq.flatten())))
+        cc_matrix = torch.corrcoef(torch.stack((output_normal.float().flatten(), output_hqq.float().flatten())))
         assert cc_matrix.min() > 0.97
 
         # check outputs are the same after unmerging
         model.unmerge_adapter()
         with torch.inference_mode():
             output_unmerged = model(**inputs).logits
-        cc_matrix = torch.corrcoef(torch.stack((output_normal.flatten(), output_unmerged.flatten())))
+        cc_matrix = torch.corrcoef(torch.stack((output_normal.float().flatten(), output_unmerged.float().flatten())))
         assert cc_matrix.min() > 0.97
 
         # check that the results are the same after saving and loading
@@ -2957,7 +2957,9 @@ def test_hqq_lora_model_outputs(self):
         model = model.merge_and_unload()
         with torch.inference_mode():
             output_merged_unloaded = model(**inputs).logits
-        cc_matrix = torch.corrcoef(torch.stack((output_normal.flatten(), output_merged_unloaded.flatten())))
+        cc_matrix = torch.corrcoef(
+            torch.stack((output_normal.float().flatten(), output_merged_unloaded.float().flatten()))
+        )
         assert cc_matrix.min() > 0.97
 
 

From c925d0ae25a1fef79a2b29a71fae7c983e40a61e Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Thu, 10 Oct 2024 16:43:28 +0200
Subject: [PATCH 21/22] FIX Bug in target module optimization if suffix (#2144)

Solves the following bug:

https://github.com/huggingface/diffusers/pull/9622#issuecomment-2404789721

The cause for the bug is as follows: When we have, say, a module called
"bar.0.query" that we want to target and another module called
"foo_bar.0.query" that we don't want to target, there was potential for
an error. This is not caused by _find_minimal_target_modules directly,
but rather the bug was inside of BaseTuner.inject_adapter and how the
names_no_target were chosen. Those used to be chosen based on suffix. In
our example, however, "bar.0.query" is a suffix of "foo_bar.0.query",
therefore "foo_bar.0.query" was *not* added to names_no_target when it
should have. As a consequence, during the optimization, it looks like
"query" is safe to use as target_modules because we don't see that it
wrongly matches "foo_bar.0.query".
---
 src/peft/tuners/tuners_utils.py |  4 +++-
 tests/test_tuners_utils.py      | 42 +++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/peft/tuners/tuners_utils.py b/src/peft/tuners/tuners_utils.py
index 51994b456f..405277b6b5 100644
--- a/src/peft/tuners/tuners_utils.py
+++ b/src/peft/tuners/tuners_utils.py
@@ -461,7 +461,9 @@ def inject_adapter(
             and len(peft_config.target_modules) >= MIN_TARGET_MODULES_FOR_OPTIMIZATION
         ):
             names_no_target = [
-                name for name in key_list if not any(name.endswith(suffix) for suffix in peft_config.target_modules)
+                name
+                for name in key_list
+                if not any((name == suffix) or name.endswith("." + suffix) for suffix in peft_config.target_modules)
             ]
             new_target_modules = _find_minimal_target_modules(peft_config.target_modules, names_no_target)
             if len(new_target_modules) < len(peft_config.target_modules):
diff --git a/tests/test_tuners_utils.py b/tests/test_tuners_utils.py
index 5e742f3c88..06a47deb26 100644
--- a/tests/test_tuners_utils.py
+++ b/tests/test_tuners_utils.py
@@ -1400,3 +1400,45 @@ def test_suffix_is_substring_of_other_suffix(self):
         expected = {"time_emb_proj", "proj", "proj_out"}
         result = find_minimal_target_modules(target_modules, other_module_names)
         assert result == expected
+
+    def test_get_peft_modules_module_name_is_suffix_of_another_module(self):
+        # Solves the following bug:
+        # https://github.com/huggingface/diffusers/pull/9622#issuecomment-2404789721
+
+        # The cause for the bug is as follows: When we have, say, a module called "bar.0.query" that we want to target
+        # and another module called "foo_bar.0.query" that we don't want to target, there was potential for an error.
+        # This is not caused by _find_minimal_target_modules directly, but rather the bug was inside of
+        # BaseTuner.inject_adapter and how the names_no_target were chosen. Those used to be chosen based on suffix. In
+        # our example, however, "bar.0.query" is a suffix of "foo_bar.0.query", therefore "foo_bar.0.query" was *not*
+        # added to names_no_target when it should have. As a consequence, during the optimization, it looks like "query"
+        # is safe to use as target_modules because we don't see that it wrongly matches "foo_bar.0.query".
+
+        # ensure that we have sufficiently many modules to trigger the optimization
+        n_layers = MIN_TARGET_MODULES_FOR_OPTIMIZATION + 1
+
+        class InnerModule(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.query = nn.Linear(10, 10)
+
+        class OuterModule(nn.Module):
+            def __init__(self):
+                super().__init__()
+                # note that "transformer_blocks" is a suffix of "single_transformer_blocks"
+                self.transformer_blocks = nn.ModuleList([InnerModule() for _ in range(n_layers)])
+                self.single_transformer_blocks = nn.ModuleList([InnerModule() for _ in range(n_layers)])
+
+        # we want to match all "transformer_blocks" layers but not "single_transformer_blocks"
+        target_modules = [f"transformer_blocks.{i}.query" for i in range(n_layers)]
+        model = get_peft_model(OuterModule(), LoraConfig(target_modules=target_modules))
+
+        # sanity check: we should have n_layers PEFT layers in model.transformer_blocks
+        transformer_blocks = model.base_model.model.transformer_blocks
+        assert sum(isinstance(module, BaseTunerLayer) for module in transformer_blocks.modules()) == n_layers
+
+        # we should not have any PEFT layers in model.single_transformer_blocks
+        single_transformer_blocks = model.base_model.model.single_transformer_blocks
+        assert not any(isinstance(module, BaseTunerLayer) for module in single_transformer_blocks.modules())
+
+        # target modules should *not* be simplified to "query" as that would match "single_transformers_blocks" too
+        assert model.peft_config["default"].target_modules != {"query"}

From 749b92456218f7dddc8f7a9aa27a41815b3d6c2e Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Date: Fri, 11 Oct 2024 20:50:05 +0200
Subject: [PATCH 22/22] Bump version to 0.13.2.dev0 (#2145)

After the patch release of PEFT v0.13.2, let's bump the dev version of
PEFT to v0.13.3.dev0 so that it stays ahead (the bugfix from the patch
release is already contained in the main branch).
---
 setup.py             | 2 +-
 src/peft/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index d1d4adbc3e..443ff8e3b6 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
 from setuptools import find_packages, setup
 
 
-VERSION = "0.13.2.dev0"
+VERSION = "0.13.3.dev0"
 
 extras = {}
 extras["quality"] = [
diff --git a/src/peft/__init__.py b/src/peft/__init__.py
index 32d7a1a43d..6b2908b35d 100644
--- a/src/peft/__init__.py
+++ b/src/peft/__init__.py
@@ -17,7 +17,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.13.2.dev0"
+__version__ = "0.13.3.dev0"
 
 from .auto import (
     AutoPeftModel,