From 07f0b86e0d590297d07dc86ce367e07692255a38 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Mon, 28 Oct 2024 09:53:21 +0400
Subject: [PATCH 1/3] Restore SDPA in Gemma2 models for transformers > 4.45

---
 optimum/exporters/openvino/model_patcher.py | 20 ++++++++++++++++++++
 tests/openvino/test_modeling.py             |  8 ++++++++
 2 files changed, 28 insertions(+)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 3bc9452ff9..7e5cd76a76 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -2505,6 +2505,26 @@ def patched_forward(*args, **kwargs):
 
         self.patched_forward = patched_forward
 
+    def __enter__(self):
+        super().__enter__()
+        if is_transformers_version(">=", "4.45.0"):
+            from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_CLASSES
+
+            sdpa_attn = GEMMA2_ATTENTION_CLASSES["sdpa"]
+            eager_attn = GEMMA2_ATTENTION_CLASSES["eager"]
+
+            for layer in self._model.model.layers:
+                if isinstance(layer.self_attn, eager_attn):
+                    layer.self_attn._orig_forward = layer.self_attn.forward
+                    layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        if is_transformers_version(">=", "4.45.0"):
+            for layer in self._model.model.layers:
+                if hasattr(layer.self_attn, "_orig_forward"):
+                    layer.self_attn.forward = layer.self_attn._orig_forward
+
 
 def _decilm_attn_forward(
     self,
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 119e004035..a139d88025 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -863,6 +863,10 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in self.REMOTE_CODE_MODELS:
             model_kwargs = {"trust_remote_code": True}
 
+        # starting from transformers 4.45.0 gemma2 uses eager attention by default, while ov - sdpa
+        if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
+            model_kwargs["attn_implemenation"] = "sdpa"
+
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
@@ -1094,6 +1098,10 @@ def test_beam_search(self, model_arch):
                 "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
                 "trust_remote_code": True,
             }
+
+        # starting from transformers 4.45.0 gemma2 uses eager attention by default, while ov - sdpa
+        if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
+            model_kwargs["attn_implemenation"] = "sdpa"
         # Qwen tokenizer does not support padding, chatglm, glm4 testing models produce nan that incompatible with beam search
         if model_arch in ["qwen", "chatglm", "glm4"]:
             return

From 86173d99fbc188e84bcd0281c0797c3aa5414faf Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Mon, 28 Oct 2024 12:57:34 +0400
Subject: [PATCH 2/3] Update tests/openvino/test_modeling.py

---
 tests/openvino/test_modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index a139d88025..9a3d251fc7 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -865,7 +865,7 @@ def test_compare_to_transformers(self, model_arch):
 
         # starting from transformers 4.45.0 gemma2 uses eager attention by default, while ov - sdpa
         if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
-            model_kwargs["attn_implemenation"] = "sdpa"
+            model_kwargs["attn_implementation"] = "sdpa"
 
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)

From e21c951fca40e4dd51d035730e208eb1c51c980e Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Mon, 28 Oct 2024 14:45:37 +0400
Subject: [PATCH 3/3] Update tests/openvino/test_modeling.py

---
 tests/openvino/test_modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 9a3d251fc7..082ffef285 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -1101,7 +1101,7 @@ def test_beam_search(self, model_arch):
 
         # starting from transformers 4.45.0 gemma2 uses eager attention by default, while ov - sdpa
         if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
-            model_kwargs["attn_implemenation"] = "sdpa"
+            model_kwargs["attn_implementation"] = "sdpa"
         # Qwen tokenizer does not support padding, chatglm, glm4 testing models produce nan that incompatible with beam search
         if model_arch in ["qwen", "chatglm", "glm4"]:
             return