From 71ae2daa6a20101d304e87523a6052fe3d0025db Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Tue, 6 Aug 2024 13:43:04 +0200
Subject: [PATCH] Support Transformers 4.43 (#856)

* install from pr
* updates
* fix
* update TRANSFORMERS_MAX_VERSION
* fix sdpa in training
* fix whisper
* fix
* whisper calibration checks
* fix OVTrainerTextClassificationTrainingTest's expected fake quantize
* fix OVCLIExportTestCase's expected_int4
* update min ci transformers version to 4.37
* fix OVQuantizerTest's expected fake quantize
* reorder_cache
* fix expected compressed matmuls
* fix test_exporters_cli_int4_with_local_model_and_default_config
* fix qwen custom modeling test
* fix failing ipex tests
* fix ipex
* fix the last ipex failing test_compare_with_and_without_past_key_values
* use minimal prepare_inputs_for_generation in OVModelForSpeechSeq2Seq
* keeping compatibility with transformers 4.36
* keep support of whisper using WhisperGenerationMixin.generate and dummy model fix
* trigger
* fix
* device property
* standardize .device and ._device attributes/properties
* fix
* fix
* revert

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>

* use falcon
* torch.device property always cpu
* style
* resolve conflicts
* decoder_attention_mask for older versions
* optimum main
* limit inc transformers version
* fix pipeline missing dtype
* add dtype for seq to seq models
* pass phi beam search test and skip internlm2
* fix for internlm2

---------

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 .github/workflows/test_ipex.yml | 42 +-
 .github/workflows/test_openvino.yml | 57 +-
 .github/workflows/test_openvino_basic.yml | 64 ++-
 optimum/exporters/ipex/model_patcher.py | 2 +-
 optimum/intel/ipex/modeling_base.py | 6 +-
 optimum/intel/openvino/modeling.py | 1 -
 optimum/intel/openvino/modeling_base.py | 35 +-
 .../intel/openvino/modeling_base_seq2seq.py | 2 +
 optimum/intel/openvino/modeling_diffusion.py | 23 +-
 optimum/intel/openvino/modeling_seq2seq.py | 531 ++++--------
 optimum/intel/openvino/utils.py | 17 +-
 setup.py | 8 +-
 tests/ipex/test_modeling.py | 19 +-
 tests/ipex/test_pipelines.py | 4 +-
 tests/openvino/test_exporters_cli.py | 18 +-
 tests/openvino/test_modeling.py | 13 +-
 tests/openvino/test_quantization.py | 10 +-
 tests/openvino/test_stable_diffusion.py | 6 +-
 18 files changed, 300 insertions(+), 558 deletions(-)

diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml
index 96ef047aaf..8b97bdd535 100644
--- a/.github/workflows/test_ipex.yml
+++ b/.github/workflows/test_ipex.yml
@@ -17,26 +17,32 @@ concurrency:
 jobs:
   build:
+    runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
-        transformers-version: [4.39.0, 4.41.2]
-        os: [ubuntu-latest]
+        python-version: [3.9]
+        transformers-version: ["4.39.0", "4.43.*"]
+        ipex-version: ["2.2.0", "2.3.*"]
+        include:
+          - python-version: 3.8
+            transformers-version: 4.39.0
+            ipex-version: 2.2.0
-    runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
-          pip install .[ipex,tests]
-          pip install transformers==${{ matrix.transformers-version }}
-      - name: Test with Pytest
-        run: |
-          pytest tests/ipex/
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install torch==${{ matrix.ipex-version }} --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install intel_extension_for_pytorch==${{ matrix.ipex-version }}
+          pip install Pillow parameterized
+          pip install transformers[testing]==${{ matrix.transformers-version }}
+          pip install .[ipex]
+      - name: Test with Pytest
+        run: |
+          pytest tests/ipex/
diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml
index 6f9675cde7..13a6b83e57 100644
--- a/.github/workflows/test_openvino.yml
+++ b/.github/workflows/test_openvino.yml
@@ -21,36 +21,37 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ["3.8", "3.12"]
-        transformers-version: ["4.36.0", "4.42.*"]
+        transformers-version: ["4.36.0", "4.43.*"]
         os: [ubuntu-latest]
     runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v4
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
-          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install transformers==${{ matrix.transformers-version }}
-          pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
-      - name: Test with Pytest
-        env:
-          HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-        run: |
-          pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
-      - name: Test basic
-        run: |
-          pip uninstall -y nncf
-          pytest tests/openvino/test_modeling_basic.py
-      - name: Test openvino-nightly
-        run: |
-          pip uninstall -y openvino
-          pip install openvino-nightly
-          python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
-          optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
+      - uses: actions/checkout@v4
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
+          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
+          pip install transformers==${{ matrix.transformers-version }}
+
+      - name: Test with Pytest
+        env:
+          HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+        run: |
+          pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
+      - name: Test basic
+        run: |
+          pip uninstall -y nncf
+          pytest tests/openvino/test_modeling_basic.py
+      - name: Test openvino-nightly
+        run: |
+          pip uninstall -y openvino
+          pip install openvino-nightly
+          python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
+          optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml
index 141b94425e..28c8369c75 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -3,7 +3,7 @@ name: OpenVINO - Basic Test on: workflow_dispatch: schedule: - - cron: '41 1 * * *' # run every day at 1:41 + - cron: "41 1 * * *" # run every day at 1:41 push: branches: - v*-release @@ -23,36 +23,42 @@ jobs: # Testing lower and upper bound of supported Python versions # This also ensures that the test fails if dependencies break for Python 3.7 python-version: ["3.8", "3.12"] - optimum: ['optimum', 'git+https://github.com/huggingface/optimum.git'] os: ["ubuntu-22.04", "windows-latest"] + transformers-version: ["4.43.*"] + include: + - python-version: "3.12" + os: "ubuntu-22.04" + transformers-version: "4.36.0" runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v4 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - # Install openvino manually to prevent dependency conflicts when .[openvino] pins - # optimum or transformers to a specific version - # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[tests] openvino ${{ matrix.optimum}} - - - name: Pip freeze - run: pip freeze - - - name: Test with Pytest - run: | - pytest tests/openvino/test_modeling_basic.py - - - name: Slow tests - run: | - pip install nncf - pytest tests/openvino -s -m "run_slow" --durations=0 - env: - RUN_SLOW: 1 + - uses: actions/checkout@v4 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + # Install openvino manually to prevent dependency conflicts when .[openvino] pins + # optimum or transformers to a specific version + pip install .[tests] openvino + pip install transformers==${{ matrix.transformers-version }} + + - name: Pip freeze + run: pip freeze + + - name: Test with Pytest + run: | + pytest tests/openvino/test_modeling_basic.py + + - name: Slow tests + run: | + pip install nncf + pytest tests/openvino -s -m "run_slow" --durations=0 + env: + RUN_SLOW: 1 diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index d1c6668b7e..9c72a0eeda 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -31,7 +31,7 @@ # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version _TRANSFORMERS_MIN_VERSION = "4.39.0" -_TRANSFORMERS_MAX_VERSION = "4.41.2" +_TRANSFORMERS_MAX_VERSION = "4.43.99" _IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) _IPEX_EXPORTED_TASK = ("text-generation",) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index aa57ec20fa..f42cd8908a 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -470,9 +470,11 @@ def __init__( self._reorder_cache = _ipex_reorder_cache else: # Check if _reorder_cache is a static method - if isinstance(self.model_cls.__dict__["_reorder_cache"], staticmethod): + if "_reorder_cache" in self.model_cls.__dict__ 
and isinstance( + self.model_cls.__dict__["_reorder_cache"], staticmethod + ): self._reorder_cache = self.model_cls._reorder_cache - else: + elif "_reorder_cache" in self.model_cls.__dict__: self._reorder_cache = self.model_cls._reorder_cache.__get__(self) if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon"}: diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 5e0fe61f8a..ba2d264346 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -129,7 +129,6 @@ def __init__(self, model: openvino.runtime.Model, config: transformers.Pretraine # Avoid warnings when creating a transformers pipeline AutoConfig.register(self.base_model_prefix, AutoConfig) self.auto_model_class.register(AutoConfig, self.__class__) - self.device = torch.device("cpu") def to(self, device: str): """ diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b90e6285a9..90c43b7805 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -20,6 +20,7 @@ from typing import Dict, Optional, Union import openvino +import torch from huggingface_hub import hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from openvino import Core, convert_model @@ -34,7 +35,7 @@ from ...exporters.openvino import export, main_export from ..utils.import_utils import is_nncf_available from .configuration import OVConfig, OVDynamicQuantizationConfig, OVWeightQuantizationConfig -from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, _print_compiled_model_properties +from .utils import ONNX_WEIGHTS_NAME, OV_TO_PT_TYPE, OV_XML_FILE_NAME, _print_compiled_model_properties core = Core() @@ -77,16 +78,27 @@ def __init__( model = self._reshape(model, -1, -1, height, width) input_names = {} + input_dtypes = {} for idx, key in enumerate(model.inputs): names = tuple(key.get_names()) input_names[next((name for name in names if "/" not in name), names[0])] = idx + input_dtypes[ + next((name for name in names if "/" not in name), names[0]) + ] = key.get_element_type().get_type_name() self.input_names = input_names + self.input_dtypes = input_dtypes output_names = {} + output_dtypes = {} for idx, key in enumerate(model.outputs): names = tuple(key.get_names()) output_names[next((name for name in names if "/" not in name), names[0])] = idx + output_dtypes[ + next((name for name in names if "/" not in name), names[0]) + ] = key.get_element_type().get_type_name() + self.output_names = output_names + self.output_dtypes = output_dtypes self.model = model self.request = None @@ -103,6 +115,27 @@ def __init__( if enable_compilation: self.compile() + @property + def device(self) -> torch.device: + """ + `torch.device`: The device on which the module is (for torch compatibility). 
+ """ + return torch.device("cpu") + + @property + def dtype(self) -> Optional[torch.dtype]: + for dtype in self.input_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + @staticmethod def load_model( file_name: Union[str, Path], diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 718b2f874e..95ffbc930a 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -350,6 +350,8 @@ def _reshape(self, model: openvino.runtime.Model, batch_size: int, sequence_leng shapes[inputs][0] = batch_size if not is_decoder else -1 if inputs.get_any_name().startswith("past_key_values"): shapes[inputs][2] = -1 + elif inputs.get_any_name().startswith("cache_position"): + shapes[inputs][0] = sequence_length elif is_decoder and not inputs.get_any_name().startswith("encoder"): shapes[inputs][1] = -1 else: diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 1b880e736c..e58f6156bb 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -25,6 +25,7 @@ import numpy as np import openvino import PIL +import torch from diffusers import ( DDIMScheduler, LMSDiscreteScheduler, @@ -420,10 +421,6 @@ def to(self, device: str): return self - @property - def device(self) -> str: - return self._device.lower() - @property def height(self) -> int: height = self.unet.model.inputs[0].get_partial_shape()[2] @@ -629,21 +626,25 @@ def _compile(self): if ( "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()) - and "gpu" in self.device.lower() + and "GPU" in self._device ): self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") - logger.info(f"Compiling the {self._model_name} to {self.device} ...") - self.request = core.compile_model(self.model, self.device, self.ov_config) + logger.info(f"Compiling the {self._model_name} to {self._device} ...") + self.request = core.compile_model(self.model, self._device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: - logger.info(f"{self.device} SUPPORTED_PROPERTIES:") + logger.info(f"{self._device} SUPPORTED_PROPERTIES:") _print_compiled_model_properties(self.request) @property - def device(self): + def _device(self) -> str: return self.parent_model._device + @property + def device(self) -> torch.device: + return self.parent_model.device + class OVModelTextEncoder(OVModelPart): def __init__( @@ -715,7 +716,7 @@ def __call__(self, latent_sample: np.ndarray): return list(outputs.values()) def _compile(self): - if "GPU" in self.device: + if "GPU" in self._device: self.ov_config.update({"INFERENCE_PRECISION_HINT": "f32"}) super()._compile() @@ -736,7 +737,7 @@ def __call__(self, sample: np.ndarray): return list(outputs.values()) def _compile(self): - if "GPU" in self.device: + if "GPU" in self._device: self.ov_config.update({"INFERENCE_PRECISION_HINT": "f32"}) super()._compile() diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 
6d72dc7b0e..629fc41855 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import logging import os from pathlib import Path @@ -34,14 +33,18 @@ ) from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin -from transformers.generation.logits_process import WhisperTimeStampLogitsProcessor from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput -from transformers.models.whisper.tokenization_whisper import TASK_IDS, TO_LANGUAGE_CODE +from ..utils import is_transformers_version from .modeling_base_seq2seq import OVBaseModelForSeq2SeqLM -from .utils import _print_compiled_model_properties +from .utils import OV_TO_PT_TYPE, _print_compiled_model_properties +if is_transformers_version(">=", "4.43.0"): + from transformers.cache_utils import EncoderDecoderCache +else: + EncoderDecoderCache = dict + if TYPE_CHECKING: from transformers import PretrainedConfig @@ -320,7 +323,7 @@ def __init__( super().__init__( encoder=encoder, decoder=decoder, decoder_with_past=decoder_with_past, config=config, **kwargs ) - self.device = torch.device("cpu") + self.decoder_with_past = None enable_compilation = kwargs.get("compile", True) self.encoder = OVEncoder(self.encoder_model, parent_model=self) @@ -341,16 +344,16 @@ def __init__( def to(self, device: str): if isinstance(device, str): self._device = device.upper() - self.encoder._device = self._device - self.decoder._device = self._device - if self.use_cache: - self.decoder_with_past._device = self._device self.clear_requests() else: logger.debug(f"device must be of type {str} but got {type(device)} instead") return self + @property + def dtype(self) -> Optional[torch.dtype]: + return self.encoder.dtype or self.decoder.dtype + @add_start_docstrings_to_model_forward( SEQ2SEQ_MODEL_DOCSTRING.format("batch_size, sequence_length") + TRANSLATION_EXAMPLE.format( @@ -367,6 +370,7 @@ def forward( decoder_attention_mask: Optional[torch.LongTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Seq2SeqLMOutput: # Encode if needed : first prediction pass @@ -388,6 +392,7 @@ def forward( encoder_hidden_states=encoder_outputs.last_hidden_state, encoder_attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, + cache_position=cache_position, ) return Seq2SeqLMOutput(logits=decoder_outputs.logits, past_key_values=decoder_outputs.past_key_values) @@ -476,12 +481,34 @@ class OVEncoder: def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2SeqLM): self.model = model self.parent_model = parent_model - self._device = self.parent_model._device - self.device = torch.device("cpu") self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} + self.input_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.inputs} + self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs} self.main_input_name = self.parent_model.main_input_name or "input_ids" self.request = None + @property + def _device(self): + return self.parent_model._device + + @property + def device(self): + return 
self.parent_model.device + + @property + def dtype(self) -> Optional[torch.dtype]: + for dtype in self.input_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + @add_start_docstrings_to_model_forward(ENCODER_INPUTS_DOCSTRING) def forward( self, @@ -541,11 +568,11 @@ class OVDecoder: def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2SeqLM): self.model = model self.parent_model = parent_model - self._device = self.parent_model._device - self.device = torch.device("cpu") self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} + self.input_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.inputs} self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)} + self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs} self.key_value_output_names = [key for key in self.output_names if "key_values" in key or "present" in key] is_legacy = any("past_key_values" in key.get_any_name() for key in self.model.outputs) @@ -558,6 +585,28 @@ def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2Se self.request = None + @property + def _device(self) -> str: + return self.parent_model._device + + @property + def device(self) -> torch.device: + return self.parent_model.device + + @property + def dtype(self) -> Optional[torch.dtype]: + for dtype in self.input_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + @add_start_docstrings_to_model_forward(DECODER_INPUTS_DOCSTRING) def forward( self, @@ -566,6 +615,7 @@ def forward( encoder_attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Seq2SeqLMOutput: self._compile() # Model inputs @@ -592,6 +642,10 @@ def forward( if "decoder_attention_mask" in self.input_names and decoder_attention_mask is not None: inputs["decoder_attention_mask"] = decoder_attention_mask + + if "cache_position" in self.input_names and cache_position is not None: + inputs["cache_position"] = cache_position + # Run inference self.request.start_async(inputs, share_inputs=True) self.request.wait() @@ -832,26 +886,29 @@ class OVModelForSpeechSeq2Seq(OVModelForSeq2SeqLM): def prepare_inputs_for_generation( self, - input_ids, - attention_mask: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, + decoder_input_ids, past_key_values=None, + attention_mask=None, head_mask=None, decoder_head_mask=None, cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, + decoder_attention_mask=None, **kwargs, - ) -> Dict: - if decoder_attention_mask is None: - decoder_attention_mask = torch.ones_like(input_ids).to(input_ids.device) + ): + # cut decoder_input_ids if past is used + if past_key_values is not None: + decoder_input_ids = 
decoder_input_ids[:, -1:] + + if decoder_attention_mask is None and decoder_input_ids is not None: + decoder_attention_mask = torch.ones_like(decoder_input_ids).to(decoder_input_ids.device) return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, "head_mask": head_mask, "decoder_head_mask": decoder_head_mask, "cross_attn_head_mask": cross_attn_head_mask, @@ -874,6 +931,7 @@ def forward( decoder_attention_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Seq2SeqLMOutput: return super().forward( @@ -883,6 +941,7 @@ def forward( decoder_attention_mask=decoder_attention_mask, encoder_outputs=encoder_outputs, past_key_values=past_key_values, + cache_position=cache_position, **kwargs, ) @@ -899,13 +958,17 @@ def _from_pretrained( return super()._from_pretrained(model_id, config, **kwargs) -class _OVModelForWhisper(OVModelForSpeechSeq2Seq): +class _OVModelForWhisper(OVModelForSpeechSeq2Seq, WhisperForConditionalGeneration): """ Whisper implements its own generate() method. """ auto_model_class = WhisperForConditionalGeneration + # force the use of the WhisperForConditionalGeneration generate and prepare_inputs_for_generation methods + prepare_inputs_for_generation = WhisperForConditionalGeneration.prepare_inputs_for_generation + generate = WhisperForConditionalGeneration.generate + @classmethod def _from_pretrained( cls, @@ -915,415 +978,19 @@ def _from_pretrained( ): return super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs) - # Adapted from transformers.models.whisper.modeling_whisper - def generate( - self, - input_features: Optional[torch.Tensor] = None, - generation_config=None, - logits_processor=None, - stopping_criteria=None, - prefix_allowed_tokens_fn=None, - synced_gpus=False, - return_timestamps=None, - task=None, - language=None, - is_multilingual=None, - prompt_ids: Optional[torch.Tensor] = None, - num_segment_frames: Optional[int] = None, - return_token_timestamps: Optional[bool] = None, - return_segments: bool = False, - attention_mask: Optional[torch.Tensor] = None, - time_precision: int = 0.02, - return_dict_in_generate: Optional[bool] = None, - **kwargs, - ): - if "inputs" in kwargs: - input_features = kwargs.pop("inputs") - logging.warn( - "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.", - FutureWarning, - ) - - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - if generation_config is None: - generation_config = copy.deepcopy(self.generation_config) - - input_stride = ( - 1 * 2 - ) # NOTE: replaced from `self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]` - if num_segment_frames is None: - num_segment_frames = input_stride * self.config.max_source_positions - - # 1. 
Check whether we're in shortform or longform mode - if input_features is not None: - total_input_frames = input_features.shape[-1] - elif "encoder_outputs" in kwargs: - encoder_outputs_shape = ( - kwargs["encoder_outputs"][0].shape - if isinstance(kwargs["encoder_outputs"], BaseModelOutput) - else kwargs["encoder_outputs"].shape - ) - total_input_frames = encoder_outputs_shape[1] * input_stride - else: - raise ValueError("Make sure to provide either `input_features` or `encoder_outputs` to `generate`.") - - is_shortform = total_input_frames <= num_segment_frames - - # 2. Make sure the generation config is correctly set depending on whether timestamps are to be returned or not - if return_timestamps is True: - if not hasattr(generation_config, "no_timestamps_token_id"): - raise ValueError( - "You are trying to return timestamps, but the generation config is not properly set. " - "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. " - "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" - ) - generation_config.return_timestamps = return_timestamps - elif not is_shortform: - if return_timestamps is False: - raise ValueError( - "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which " - "requires the model to predict timestamp tokens. Please either pass `return_timestamps=True` or make sure to pass no more than 3000 mel input features." - ) - - if not hasattr(generation_config, "no_timestamps_token_id"): - raise ValueError( - "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which " - "requires the generation config to have `no_timestamps_token_id` correctly. " - "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. " - "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" - "or make sure to pass no more than 3000 mel input features." - ) - - logger.info("Setting `return_timestamps=True` for long-form generation.") - generation_config.return_timestamps = True - else: - generation_config.return_timestamps = False - - # 3. Make sure to correctly set language-related parameters - if is_multilingual is not None: - if not hasattr(generation_config, "is_multilingual"): - raise ValueError( - "The generation config is outdated and is thus not compatible with the `is_multilingual` argument " - "to `generate`. Please update the generation config as per the instructions " - "https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" - ) - generation_config.is_multilingual = is_multilingual - - if hasattr(generation_config, "is_multilingual") and not generation_config.is_multilingual: - if task is not None or language is not None: - raise ValueError( - "Cannot specify `task` or `language` for an English-only model. If the model is intended to be " - "multilingual, pass `is_multilingual=True` to generate, or update the generation config." - ) - - if language is not None: - if not hasattr(generation_config, "lang_to_id"): - raise ValueError( - "The generation config is outdated and is thus not compatible with the `language` argument " - "to `generate`. 
Either set the language using the `forced_decoder_ids` in the model config, " - "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" - ) - language = language.lower() - generation_config.language = language - if task is not None: - if not hasattr(generation_config, "task_to_id"): - raise ValueError( - "The generation config is outdated and is thus not compatible with the `task` argument " - "to `generate`. Either set the task using the `forced_decoder_ids` in the model config, " - "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" - ) - generation_config.task = task - - # 4. Add forced decoder ids depending on passed `language`, `task`,`prompt_ids`, `return_token_timestamps` and `return_timestamps` - forced_decoder_ids = None - # Legacy code for backward compatibility - if hasattr(self.config, "forced_decoder_ids") and self.config.forced_decoder_ids is not None: - forced_decoder_ids = self.config.forced_decoder_ids - elif ( - hasattr(self.generation_config, "forced_decoder_ids") - and self.generation_config.forced_decoder_ids is not None - ): - forced_decoder_ids = self.generation_config.forced_decoder_ids - else: - forced_decoder_ids = kwargs.get("forced_decoder_ids", None) - - if task is not None or language is not None or (forced_decoder_ids is None and prompt_ids is not None): - forced_decoder_ids = [] - if hasattr(generation_config, "language"): - if generation_config.language in generation_config.lang_to_id.keys(): - language_token = generation_config.language - elif generation_config.language in TO_LANGUAGE_CODE.keys(): - language_token = f"<|{TO_LANGUAGE_CODE[generation_config.language]}|>" - elif generation_config.language in TO_LANGUAGE_CODE.values(): - language_token = f"<|{generation_config.language}|>" - else: - is_language_code = len(generation_config.language) == 2 - raise ValueError( - f"Unsupported language: {generation_config.language}. Language should be one of:" - f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}." - ) - forced_decoder_ids.append((1, generation_config.lang_to_id[language_token])) - else: - forced_decoder_ids.append((1, None)) # automatically detect the language - - if hasattr(generation_config, "task"): - if generation_config.task in TASK_IDS: - forced_decoder_ids.append((2, generation_config.task_to_id[generation_config.task])) - else: - raise ValueError( - f"The `{generation_config.task}`task is not supported. The task should be one of `{TASK_IDS}`" - ) - elif hasattr(generation_config, "task_to_id"): - forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"])) # defaults to transcribe - if hasattr(generation_config, "no_timestamps_token_id") and not generation_config.return_timestamps: - idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1 - forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id)) - - if forced_decoder_ids is not None: - generation_config.forced_decoder_ids = forced_decoder_ids - - if prompt_ids is not None: - if kwargs.get("decoder_start_token_id") is not None: - raise ValueError( - "When specifying `prompt_ids`, you cannot also specify `decoder_start_token_id` as it gets overwritten." 
- ) - prompt_ids = prompt_ids.tolist() - decoder_start_token_id, *text_prompt_ids = prompt_ids - # Slicing the text prompt ids in a manner consistent with the OpenAI implementation - # to accomodate context space for the prefix (see https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/decoding.py#L599) - text_prompt_ids = text_prompt_ids[-self.config.max_target_positions // 2 - 1 :] - # Set the decoder_start_token_id to <|startofprev|> - kwargs.update({"decoder_start_token_id": decoder_start_token_id}) - - # If the user passes `max_new_tokens`, increase its number to account for the prompt - if kwargs.get("max_new_tokens", None) is not None: - kwargs["max_new_tokens"] += len(text_prompt_ids) - if kwargs["max_new_tokens"] >= self.config.max_target_positions: - raise ValueError( - f"The length of the sliced `prompt_ids` is {len(text_prompt_ids)}, and the `max_new_tokens` " - f"{kwargs['max_new_tokens'] - len(text_prompt_ids)}. Thus, the combined length of the sliced " - f"`prompt_ids` and `max_new_tokens` is: {kwargs['max_new_tokens']}. This exceeds the " - f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. " - "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " - f"so that their combined length is less that {self.config.max_target_positions}." - ) - - # Reformat the forced_decoder_ids to incorporate the prompt - non_prompt_forced_decoder_ids = ( - kwargs.pop("forced_decoder_ids", None) or generation_config.forced_decoder_ids - ) - forced_decoder_ids = [ - *text_prompt_ids, - generation_config.decoder_start_token_id, - *[token for _rank, token in non_prompt_forced_decoder_ids], - ] - forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_decoder_ids)] - generation_config.forced_decoder_ids = forced_decoder_ids - - if return_token_timestamps: - kwargs["output_attentions"] = True - return_dict_in_generate = True - - if getattr(generation_config, "task", None) == "translate": - logger.warning("Token-level timestamps may not be reliable for task 'translate'.") - if not hasattr(generation_config, "alignment_heads"): - raise ValueError( - "Model generation config has no `alignment_heads`, token-level timestamps not available. " - "See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config." - ) - - if kwargs.get("num_frames") is not None: - generation_config.num_frames = kwargs.pop("num_frames") - - if generation_config.return_timestamps is True: - last_forced_decoder_ids = ( - generation_config.forced_decoder_ids[-1][-1] - if hasattr(self.config, "forced_decoder_ids") and self.config.forced_decoder_ids - else None - ) - if last_forced_decoder_ids == self.generation_config.no_timestamps_token_id: - # remove no_timestamp to be forcefully generated if we want to return timestamps - # this is also important to make sure `WhisperTimeStampLogitsProcessor` functions correctly - forced_decoder_ids = generation_config.forced_decoder_ids[:-1] - # Make sure that if list is empty we set it to None - generation_config.forced_decoder_ids = None if len(forced_decoder_ids) == 0 else forced_decoder_ids - - timestamp_processor = [WhisperTimeStampLogitsProcessor(generation_config)] - logits_processor = ( - timestamp_processor if logits_processor is None else timestamp_processor + logits_processor - ) - - # 5. 
If we're in shortform mode, simple generate the whole input at once and return the output - if is_shortform: - outputs = super().generate( - input_features, - generation_config, - logits_processor, - stopping_criteria, - prefix_allowed_tokens_fn, - synced_gpus, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - ) - - if return_token_timestamps and hasattr(generation_config, "alignment_heads"): - num_frames = getattr(generation_config, "num_frames", None) - outputs["token_timestamps"] = self._extract_token_timestamps( - outputs, generation_config.alignment_heads, num_frames=num_frames - ) - - return outputs - - # 6. Else we're in longform mode which is more complex. We need to chunk the audio input depending on when the model generated - # timestamp tokens - # 6.1 Set running parameters for while loop - if not return_segments and return_dict_in_generate: - raise ValueError( - "Make sure to set `return_segments=True` to return generation outputs as part of the `'segments' key.`" - ) - - # if input is longer than 30 seconds we default to long-form generation - timestamp_begin = self.generation_config.no_timestamps_token_id + 1 - # input stride is mel frames per encoder output vector which is the product of all conv strides - batch_size = input_features.shape[0] - - if batch_size > 1 and attention_mask is None: - raise ValueError( - "When doing long-form audio transcription, make sure to pass an `attention_mask`. You can retrieve the `attention_mask` by doing `processor(audio, ..., return_attention_mask=True)` " - ) - elif batch_size > 1: - max_frames = attention_mask.sum(-1).cpu().to(torch.long) - seek = torch.zeros((batch_size,), dtype=torch.long) - else: - max_frames = torch.ones((1,), dtype=torch.long) * total_input_frames - seek = torch.zeros((1,), dtype=torch.long) - - current_segments = [[] for _ in range(batch_size)] - cur_to_prev_index_map = list(range(batch_size)) - - # batch size can decrease during the run - cur_bsz = prev_bsz = batch_size - - # 6.2 Transcribe audio until we reach the end of all input audios - while (seek < max_frames).any(): - prev_bsz = cur_bsz - - # 6.3 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop - # in case one audio finished earlier than another one. 
Thus, we need to keep a table of "previous-index-2-current-index" in order - # to know which original audio is being decoded - new_cur_to_prev_index_map = [] - for i in range(prev_bsz): - prev_i = cur_to_prev_index_map[i] - if seek[prev_i] >= max_frames[prev_i]: - cut_index = i + (cur_bsz - prev_bsz) - cur_bsz -= 1 - input_features = torch.cat([input_features[:cut_index], input_features[cut_index + 1 :]], dim=0) - else: - # cut out index that goes away - new_cur_to_prev_index_map.append(prev_i) - - # 6.4 Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk - cur_to_prev_index_map = new_cur_to_prev_index_map - time_offset = seek * time_precision / input_stride - seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames) - - # 6.5 Make sure that all inputs are padded to the same input length - segment_input = [] - for i in range(cur_bsz): - prev_i = cur_to_prev_index_map[i] - segment_input_slice = input_features[ - i : i + 1, :, seek[prev_i] : seek[prev_i] + seek_num_frames[prev_i] - ] - - if segment_input_slice.shape[-1] < num_segment_frames: - # pad to 3000 if necessary - segment_input_slice = torch.nn.functional.pad( - segment_input_slice, pad=(0, num_segment_frames - segment_input_slice.shape[-1]) - ) - - segment_input.append(segment_input_slice) - - segment_input = torch.cat(segment_input, dim=0) - - # 6.6 Batch generate current chunk - seek_outputs = super().generate( - segment_input, - generation_config, - logits_processor, - stopping_criteria, - prefix_allowed_tokens_fn, - synced_gpus, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - ) - - if return_token_timestamps and hasattr(generation_config, "alignment_heads"): - num_frames = getattr(generation_config, "num_frames", None) - seek_outputs["token_timestamps"] = self._extract_token_timestamps( - seek_outputs, generation_config.alignment_heads, num_frames=num_frames - ) - - if return_dict_in_generate: - seek_sequences = seek_outputs["sequences"] - seek_outputs = [ - {k: v[i] for k, v in seek_outputs.items()} - for i in range(next(iter(seek_outputs.values())).size(0)) - ] - else: - seek_sequences = seek_outputs - - # 6.7 Loop over each decoded audio individually as each decoding can be of a different length - for i, seek_sequence in enumerate(seek_sequences): - prev_i = cur_to_prev_index_map[i] - - # make sure we cut a predicted EOS token if we are not finished with the generation yet - is_not_final = (seek[prev_i] + num_segment_frames) < max_frames[prev_i] - if is_not_final and seek_sequence[-1] == self.generation_config.eos_token_id: - seek_sequence = seek_sequence[:-1] - - # remove all padding tokens - if seek_sequence[-1] == self.generation_config.pad_token_id: - num_paddings = (seek_sequence == self.generation_config.pad_token_id).sum() - seek_sequence = seek_sequence[:-num_paddings] - - segments, segment_offset = self._retrieve_segment( - seek_sequence=seek_sequence, - seek_outputs=seek_outputs, - time_offset=time_offset, - timestamp_begin=timestamp_begin, - seek_num_frames=seek_num_frames, - cur_bsz=cur_bsz, - time_precision=time_precision, - input_stride=input_stride, - prev_idx=prev_i, - idx=i, - ) - - current_segments[prev_i] += segments - seek[prev_i] += segment_offset - - # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted - # output tokens from the list of dicts. 
If we use batch size > 1, we make sure to pad the output - sequences = [] - max_total_length = 0 - for current_segment_list in current_segments: - sequences.append(torch.cat([d["tokens"] for d in current_segment_list], dim=-1)) - max_total_length = max(max_total_length, len(sequences[-1])) - - for i in range(batch_size): - sequences[i] = torch.nn.functional.pad( - sequences[i], pad=(0, max_total_length - len(sequences[i])), value=self.generation_config.pad_token_id - ) + class DummyWhisperModel: + def __init__(self): + self.encoder = self.Encoder() - sequences = torch.stack(sequences, dim=0) + class Encoder: + def __init__(self): + self.conv1 = self.Conv(stride=(1,)) + self.conv2 = self.Conv(stride=(2,)) - # 8. If we return all segments, the predicted output sequences are put under `"sequences"`. - if return_segments: - return {"sequences": sequences, "segments": current_segments} + class Conv: + def __init__(self, stride): + self.stride = stride - return sequences + # a dummy model attribute that's used in the generate method to compute the input stride + # input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0] + model = DummyWhisperModel() diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 69a750fb65..06ad451237 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -21,6 +21,7 @@ from typing import Tuple, Union import numpy as np +import torch from huggingface_hub import model_info from openvino.runtime import Core, Type, properties from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -52,7 +53,6 @@ TEXTUAL_INVERSION_NAME_SAFE = "learned_embeds.safetensors" TEXTUAL_INVERSION_EMBEDDING_KEY = "text_model.embeddings.token_embedding.weight" - OV_TO_NP_TYPE = { "boolean": np.bool_, "i8": np.int8, @@ -68,6 +68,21 @@ "f64": np.float64, } +OV_TO_PT_TYPE = { + "boolean": torch.bool, + "i8": torch.int8, + "u8": torch.uint8, + "i16": torch.int16, + "u16": torch.uint16, + "i32": torch.int32, + "u32": torch.uint32, + "i64": torch.int64, + "u64": torch.uint64, + "f16": torch.float16, + "f32": torch.float32, + "f64": torch.float64, +} + STR_TO_OV_TYPE = { "boolean": Type.boolean, diff --git a/setup.py b/setup.py index 8abcbfaf08..853d012e54 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "transformers>=4.36.0,<4.43.0", - "optimum>=1.21.2,<1.22.0", + "transformers>=4.36.0,<4.44.0", + "optimum@git+https://github.com/huggingface/optimum.git@v1.21.3-release", "datasets>=1.4.0", "sentencepiece", "setuptools", @@ -59,10 +59,10 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"], + "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate", "transformers<4.43.0"], "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.11.0"], - "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.41.2"], + "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<4.44.0"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index b6635ca154..8e56dd3957 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -103,8 +103,8 @@ def test_compare_to_transformers(self, model_arch): for output_name in {"logits", 
"last_hidden_state"}: if output_name in transformers_outputs: self.assertTrue(torch.allclose(outputs[output_name], transformers_outputs[output_name], atol=1e-4)) - self.assertTrue(torch.equal(outputs[output_name], loaded_model_outputs[output_name])) - self.assertTrue(torch.equal(outputs[output_name], init_model_outputs[output_name])) + self.assertTrue(torch.allclose(outputs[output_name], loaded_model_outputs[output_name])) + self.assertTrue(torch.allclose(outputs[output_name], init_model_outputs[output_name])) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): @@ -259,7 +259,7 @@ def test_pipeline(self, model_arch): model.config.encoder_no_repeat_ngram_size = 0 model.to("cpu") pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) - outputs = pipe("This is a sample", max_length=10) + outputs = pipe("This is a sample", max_new_tokens=10) self.assertEqual(pipe.device, model.device) self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs)) @@ -330,7 +330,7 @@ def test_compare_with_and_without_past_key_values(self): model_with_pkv.generate(**tokens) with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 ) model_without_pkv = IPEXModelForCausalLM.from_pretrained( model_id, use_cache=False, subfolder="model_without_pkv" @@ -339,16 +339,11 @@ def test_compare_with_and_without_past_key_values(self): model_without_pkv.generate(**tokens) with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 ) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) - self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - # self.assertTrue( - # without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - # f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - # f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - # ) + self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + tokens.input_ids.shape[1]) + self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + tokens.input_ids.shape[1]) class IPEXModelForAudioClassificationTest(unittest.TestCase): diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index c4ae471a0f..767097a5dd 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -137,9 +137,9 @@ def test_text_generation_pipeline_inference(self, model_arch): ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex") inputs = "Describe a real-world application of AI." 
with torch.inference_mode(): - transformers_output = transformers_generator(inputs) + transformers_output = transformers_generator(inputs, max_new_tokens=10) with torch.inference_mode(): - ipex_output = ipex_generator(inputs) + ipex_output = ipex_generator(inputs, max_new_tokens=10) self.assertTrue(isinstance(ipex_generator.model, IPEXModelForCausalLM)) self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) self.assertEqual(transformers_output[0]["generated_text"], ipex_output[0]["generated_text"]) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 839dd55add..79c3920e08 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -241,6 +241,7 @@ def test_exporters_cli_int4_with_local_model_and_default_config(self): # overload for matching with default configuration pt_model.config._name_or_path = "tiiuae/falcon-7b-instruct" pt_model.save_pretrained(tmpdir) + subprocess.run( f"optimum-cli export openvino --model {tmpdir} --task text-generation-with-past --weight-format int4 {tmpdir}", shell=True, @@ -251,16 +252,23 @@ def test_exporters_cli_int4_with_local_model_and_default_config(self): rt_info = model.model.get_rt_info() self.assertTrue("nncf" in rt_info) self.assertTrue("weight_compression" in rt_info["nncf"]) - default_config = _DEFAULT_4BIT_CONFIGS["tiiuae/falcon-7b-instruct"] model_weight_compression_config = rt_info["nncf"]["weight_compression"] - sym = default_config.pop("sym", False) + + default_config = _DEFAULT_4BIT_CONFIGS["tiiuae/falcon-7b-instruct"] bits = default_config.pop("bits", None) self.assertEqual(bits, 4) - mode = f'int{bits}_{"sym" if sym else "asym"}' - default_config["mode"] = mode + sym = default_config.pop("sym", False) + default_config["mode"] = f'int{bits}_{"sym" if sym else "asym"}' + + quant_method = default_config.pop("quant_method", None) + default_config["awq"] = quant_method == "awq" + default_config["gptq"] = quant_method == "gptq" + + default_config.pop("dataset", None) + for key, value in default_config.items(): - self.assertTrue(key in model_weight_compression_config) + self.assertIn(key, model_weight_compression_config) self.assertEqual( model_weight_compression_config[key].value, str(value), diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 26071be2eb..3eb7feaecc 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -941,7 +941,7 @@ def test_beam_search(self, model_arch): eos_token_id=None, ) - if model_arch == "minicpm": + if model_arch in ["minicpm", "internlm2"]: beam_sample_gen_config.top_k = 1 group_beam_search_gen_config = GenerationConfig( @@ -970,12 +970,15 @@ def test_beam_search(self, model_arch): group_beam_search_gen_config, constrained_beam_search_gen_config, ] + set_seed(SEED) ov_model_stateful = OVModelForCausalLM.from_pretrained( model_id, export=True, use_cache=True, stateful=True, **model_kwargs ) + set_seed(SEED) ov_model_stateless = OVModelForCausalLM.from_pretrained( model_id, export=True, use_cache=True, stateful=False, **model_kwargs ) + set_seed(SEED) transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) if model_arch == "arctic": @@ -1662,15 +1665,15 @@ def _generate_random_audio_data(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): - model_id = MODEL_NAMES[model_arch] set_seed(SEED) + model_id = MODEL_NAMES[model_arch] + transformers_model = 
AutoModelForSpeechSeq2Seq.from_pretrained(model_id) ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_model.config, PretrainedConfig) - transformers_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id) + processor = get_preprocessor(model_id) data = self._generate_random_audio_data() features = processor.feature_extractor(data, return_tensors="pt") - decoder_start_token_id = transformers_model.config.decoder_start_token_id decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} @@ -1699,7 +1702,7 @@ def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True) - model.eval() + processor = get_preprocessor(model_id) pipe = pipeline( "automatic-speech-recognition", diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index ac158b69bf..d5f37c5f17 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -897,12 +897,16 @@ def test_calibration_data_uniqueness(self, model_id, apply_caching): ) for _ in range(2): input_features = self._generate_random_audio_data(processor) - ov_model.generate(input_features) + ov_model.generate(input_features, max_new_tokens=10, min_new_tokens=10) data_hashes_per_key = defaultdict(list) data_id_per_key = defaultdict(set) + for inputs_dict in calibration_data: for k, v in inputs_dict.items(): + if k == "input_ids": + continue + x = (v.numpy() if isinstance(v, torch.Tensor) else v).copy() data_hashes_per_key[k].append(hash(x.tobytes())) data_id_per_key[k].add(id(v)) @@ -911,7 +915,7 @@ def test_calibration_data_uniqueness(self, model_id, apply_caching): self.assertTrue(any(data_hashes[0] != it for it in data_hashes)) if apply_caching: # With caching, encoder hidden states tensors should be cached, resulting in only 2 tensors stored - self.assertTrue(len(data_id_per_key["encoder_hidden_states"]) == 2) + self.assertEqual(len(data_id_per_key["encoder_hidden_states"]), 2) else: # Without caching, encoder hidden states tensors will be unique for each collected input - self.assertTrue(len(data_id_per_key["encoder_hidden_states"]) > 2) + self.assertGreater(len(data_id_per_key["encoder_hidden_states"]), 2) diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index e735a07fb4..cb68c75908 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -248,7 +248,7 @@ def test_compare_to_diffusers(self, model_arch: str): self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) # Compare model devices - self.assertEqual(pipeline.device.type, ov_pipeline.device) + self.assertEqual(pipeline.device, ov_pipeline.device) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_image_reproducibility(self, model_arch: str): @@ -406,7 +406,7 @@ def test_compare_to_diffusers(self, model_arch: str): # Compare model outputs self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) # Compare model devices - self.assertEqual(pipeline.device.type, ov_pipeline.device) + self.assertEqual(pipeline.device, ov_pipeline.device) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_image_reproducibility(self, model_arch: str): @@ -536,7 +536,7 @@ def test_compare_to_diffusers(self, model_arch: str): # Compare model outputs self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) # Compare model devices - 
self.assertEqual(pipeline.device.type, ov_pipeline.device)
+        self.assertEqual(pipeline.device, ov_pipeline.device)

     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @pytest.mark.run_slow