From 71ae2daa6a20101d304e87523a6052fe3d0025db Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Tue, 6 Aug 2024 13:43:04 +0200
Subject: [PATCH] Support Transformers 4.43 (#856)

* install from pr
* updates
* fix
* update TRANSFORMERS_MAX_VERSION
* fix sdpa in training
* fix whisper
* fix
* whisper calibration checks
* fix OVTrainerTextClassificationTrainingTest's expected fake quantize
* fix OVCLIExportTestCase's expected_int4
* update min ci transformers version to 4.37
* fix OVQuantizerTest's expected fake quantize
* reorder_cache
* fix expected compressed matmuls
* fix test_exporters_cli_int4_with_local_model_and_default_config
* fix qwen custom modeling test
* fix failing ipex tests
* fix ipex
* fix the last ipex failing test_compare_with_and_without_past_key_values
* use minimal prepare_inputs_for_generation in OVModelForSpeechSeq2Seq
* keeping compatibility with transformers 4.36
* keep support of whisper using WhisperGenerationMixin.generate and dummy model fix
* trigger
* fix
* device property
* standardize .device and ._device attributes/properties
* fix
* fix
* revert

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>

* use falcon
* torch.device property always cpu
* style
* resolve conflicts
* decoder_attention_mask for older versions
* optimum main
* limit inc transformers version
* fix pipeline missing dtype
* add dtype for seq to seq models
* pass phi beam search test and skip internlm2
* fix for internlm2

---------

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 .github/workflows/test_ipex.yml | 42 +-
 .github/workflows/test_openvino.yml | 57 +-
 .github/workflows/test_openvino_basic.yml | 64 ++-
 optimum/exporters/ipex/model_patcher.py | 2 +-
 optimum/intel/ipex/modeling_base.py | 6 +-
 optimum/intel/openvino/modeling.py | 1 -
 optimum/intel/openvino/modeling_base.py | 35 +-
 .../intel/openvino/modeling_base_seq2seq.py | 2 +
 optimum/intel/openvino/modeling_diffusion.py | 23 +-
 optimum/intel/openvino/modeling_seq2seq.py | 531 ++++--------
 optimum/intel/openvino/utils.py | 17 +-
 setup.py | 8 +-
 tests/ipex/test_modeling.py | 19 +-
 tests/ipex/test_pipelines.py | 4 +-
 tests/openvino/test_exporters_cli.py | 18 +-
 tests/openvino/test_modeling.py | 13 +-
 tests/openvino/test_quantization.py | 10 +-
 tests/openvino/test_stable_diffusion.py | 6 +-
 18 files changed, 300 insertions(+), 558 deletions(-)

diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml
index 96ef047aaf..8b97bdd535 100644
--- a/.github/workflows/test_ipex.yml
+++ b/.github/workflows/test_ipex.yml
@@ -17,26 +17,32 @@ concurrency:
 jobs:
   build:
+    runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
-        transformers-version: [4.39.0, 4.41.2]
-        os: [ubuntu-latest]
+        python-version: [3.9]
+        transformers-version: ["4.39.0", "4.43.*"]
+        ipex-version: ["2.2.0", "2.3.*"]
+        include:
+          - python-version: 3.8
+            transformers-version: 4.39.0
+            ipex-version: 2.2.0
-    runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
-          pip install .[ipex,tests]
-          pip install transformers==${{ matrix.transformers-version }}
-      - name: Test with Pytest
-        run: |
-          pytest tests/ipex/
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install torch==${{ matrix.ipex-version }} --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install intel_extension_for_pytorch==${{ matrix.ipex-version }}
+          pip install Pillow parameterized
+          pip install transformers[testing]==${{ matrix.transformers-version }}
+          pip install .[ipex]
+      - name: Test with Pytest
+        run: |
+          pytest tests/ipex/
diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml
index 6f9675cde7..13a6b83e57 100644
--- a/.github/workflows/test_openvino.yml
+++ b/.github/workflows/test_openvino.yml
@@ -21,36 +21,37 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ["3.8", "3.12"]
-        transformers-version: ["4.36.0", "4.42.*"]
+        transformers-version: ["4.36.0", "4.43.*"]
         os: [ubuntu-latest]
     runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v4
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
-          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install transformers==${{ matrix.transformers-version }}
-          pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
-      - name: Test with Pytest
-        env:
-          HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-        run: |
-          pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
-      - name: Test basic
-        run: |
-          pip uninstall -y nncf
-          pytest tests/openvino/test_modeling_basic.py
-      - name: Test openvino-nightly
-        run: |
-          pip uninstall -y openvino
-          pip install openvino-nightly
-          python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
-          optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
+      - uses: actions/checkout@v4
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
+          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
+          pip install transformers==${{ matrix.transformers-version }}
+
+      - name: Test with Pytest
+        env:
+          HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+        run: |
+          pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
+      - name: Test basic
+        run: |
+          pip uninstall -y nncf
+          pytest tests/openvino/test_modeling_basic.py
+      - name: Test openvino-nightly
+        run: |
+          pip uninstall -y openvino
+          pip install openvino-nightly
+          python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
+          optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml
index 141b94425e..28c8369c75 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -3,7 +3,7 @@ name: OpenVINO - Basic Test on: workflow_dispatch: schedule: - - cron: '41 1 * * *' # run every day at 1:41 + - cron: "41 1 * * *" # run every day at 1:41 push: branches: - v*-release @@ -23,36 +23,42 @@ jobs: # Testing lower and upper bound of supported Python versions # This also ensures that the test fails if dependencies break for Python 3.7 python-version: ["3.8", "3.12"] - optimum: ['optimum', 'git+https://github.com/huggingface/optimum.git'] os: ["ubuntu-22.04", "windows-latest"] + transformers-version: ["4.43.*"] + include: + - python-version: "3.12" + os: "ubuntu-22.04" + transformers-version: "4.36.0" runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v4 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - # Install openvino manually to prevent dependency conflicts when .[openvino] pins - # optimum or transformers to a specific version - # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[tests] openvino ${{ matrix.optimum}} - - - name: Pip freeze - run: pip freeze - - - name: Test with Pytest - run: | - pytest tests/openvino/test_modeling_basic.py - - - name: Slow tests - run: | - pip install nncf - pytest tests/openvino -s -m "run_slow" --durations=0 - env: - RUN_SLOW: 1 + - uses: actions/checkout@v4 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + # Install openvino manually to prevent dependency conflicts when .[openvino] pins + # optimum or transformers to a specific version + pip install .[tests] openvino + pip install transformers==${{ matrix.transformers-version }} + + - name: Pip freeze + run: pip freeze + + - name: Test with Pytest + run: | + pytest tests/openvino/test_modeling_basic.py + + - name: Slow tests + run: | + pip install nncf + pytest tests/openvino -s -m "run_slow" --durations=0 + env: + RUN_SLOW: 1 diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index d1c6668b7e..9c72a0eeda 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -31,7 +31,7 @@ # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version _TRANSFORMERS_MIN_VERSION = "4.39.0" -_TRANSFORMERS_MAX_VERSION = "4.41.2" +_TRANSFORMERS_MAX_VERSION = "4.43.99" _IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) _IPEX_EXPORTED_TASK = ("text-generation",) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index aa57ec20fa..f42cd8908a 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -470,9 +470,11 @@ def __init__( self._reorder_cache = _ipex_reorder_cache else: # Check if _reorder_cache is a static method - if isinstance(self.model_cls.__dict__["_reorder_cache"], staticmethod): + if "_reorder_cache" in self.model_cls.__dict__ 
and isinstance( + self.model_cls.__dict__["_reorder_cache"], staticmethod + ): self._reorder_cache = self.model_cls._reorder_cache - else: + elif "_reorder_cache" in self.model_cls.__dict__: self._reorder_cache = self.model_cls._reorder_cache.__get__(self) if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon"}: diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 5e0fe61f8a..ba2d264346 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -129,7 +129,6 @@ def __init__(self, model: openvino.runtime.Model, config: transformers.Pretraine # Avoid warnings when creating a transformers pipeline AutoConfig.register(self.base_model_prefix, AutoConfig) self.auto_model_class.register(AutoConfig, self.__class__) - self.device = torch.device("cpu") def to(self, device: str): """ diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b90e6285a9..90c43b7805 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -20,6 +20,7 @@ from typing import Dict, Optional, Union import openvino +import torch from huggingface_hub import hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from openvino import Core, convert_model @@ -34,7 +35,7 @@ from ...exporters.openvino import export, main_export from ..utils.import_utils import is_nncf_available from .configuration import OVConfig, OVDynamicQuantizationConfig, OVWeightQuantizationConfig -from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, _print_compiled_model_properties +from .utils import ONNX_WEIGHTS_NAME, OV_TO_PT_TYPE, OV_XML_FILE_NAME, _print_compiled_model_properties core = Core() @@ -77,16 +78,27 @@ def __init__( model = self._reshape(model, -1, -1, height, width) input_names = {} + input_dtypes = {} for idx, key in enumerate(model.inputs): names = tuple(key.get_names()) input_names[next((name for name in names if "/" not in name), names[0])] = idx + input_dtypes[ + next((name for name in names if "/" not in name), names[0]) + ] = key.get_element_type().get_type_name() self.input_names = input_names + self.input_dtypes = input_dtypes output_names = {} + output_dtypes = {} for idx, key in enumerate(model.outputs): names = tuple(key.get_names()) output_names[next((name for name in names if "/" not in name), names[0])] = idx + output_dtypes[ + next((name for name in names if "/" not in name), names[0]) + ] = key.get_element_type().get_type_name() + self.output_names = output_names + self.output_dtypes = output_dtypes self.model = model self.request = None @@ -103,6 +115,27 @@ def __init__( if enable_compilation: self.compile() + @property + def device(self) -> torch.device: + """ + `torch.device`: The device on which the module is (for torch compatibility). 
+ """ + return torch.device("cpu") + + @property + def dtype(self) -> Optional[torch.dtype]: + for dtype in self.input_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + @staticmethod def load_model( file_name: Union[str, Path], diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 718b2f874e..95ffbc930a 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -350,6 +350,8 @@ def _reshape(self, model: openvino.runtime.Model, batch_size: int, sequence_leng shapes[inputs][0] = batch_size if not is_decoder else -1 if inputs.get_any_name().startswith("past_key_values"): shapes[inputs][2] = -1 + elif inputs.get_any_name().startswith("cache_position"): + shapes[inputs][0] = sequence_length elif is_decoder and not inputs.get_any_name().startswith("encoder"): shapes[inputs][1] = -1 else: diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 1b880e736c..e58f6156bb 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -25,6 +25,7 @@ import numpy as np import openvino import PIL +import torch from diffusers import ( DDIMScheduler, LMSDiscreteScheduler, @@ -420,10 +421,6 @@ def to(self, device: str): return self - @property - def device(self) -> str: - return self._device.lower() - @property def height(self) -> int: height = self.unet.model.inputs[0].get_partial_shape()[2] @@ -629,21 +626,25 @@ def _compile(self): if ( "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()) - and "gpu" in self.device.lower() + and "GPU" in self._device ): self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") - logger.info(f"Compiling the {self._model_name} to {self.device} ...") - self.request = core.compile_model(self.model, self.device, self.ov_config) + logger.info(f"Compiling the {self._model_name} to {self._device} ...") + self.request = core.compile_model(self.model, self._device, self.ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: - logger.info(f"{self.device} SUPPORTED_PROPERTIES:") + logger.info(f"{self._device} SUPPORTED_PROPERTIES:") _print_compiled_model_properties(self.request) @property - def device(self): + def _device(self) -> str: return self.parent_model._device + @property + def device(self) -> torch.device: + return self.parent_model.device + class OVModelTextEncoder(OVModelPart): def __init__( @@ -715,7 +716,7 @@ def __call__(self, latent_sample: np.ndarray): return list(outputs.values()) def _compile(self): - if "GPU" in self.device: + if "GPU" in self._device: self.ov_config.update({"INFERENCE_PRECISION_HINT": "f32"}) super()._compile() @@ -736,7 +737,7 @@ def __call__(self, sample: np.ndarray): return list(outputs.values()) def _compile(self): - if "GPU" in self.device: + if "GPU" in self._device: self.ov_config.update({"INFERENCE_PRECISION_HINT": "f32"}) super()._compile() diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 
6d72dc7b0e..629fc41855 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import logging import os from pathlib import Path @@ -34,14 +33,18 @@ ) from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin -from transformers.generation.logits_process import WhisperTimeStampLogitsProcessor from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput -from transformers.models.whisper.tokenization_whisper import TASK_IDS, TO_LANGUAGE_CODE +from ..utils import is_transformers_version from .modeling_base_seq2seq import OVBaseModelForSeq2SeqLM -from .utils import _print_compiled_model_properties +from .utils import OV_TO_PT_TYPE, _print_compiled_model_properties +if is_transformers_version(">=", "4.43.0"): + from transformers.cache_utils import EncoderDecoderCache +else: + EncoderDecoderCache = dict + if TYPE_CHECKING: from transformers import PretrainedConfig @@ -320,7 +323,7 @@ def __init__( super().__init__( encoder=encoder, decoder=decoder, decoder_with_past=decoder_with_past, config=config, **kwargs ) - self.device = torch.device("cpu") + self.decoder_with_past = None enable_compilation = kwargs.get("compile", True) self.encoder = OVEncoder(self.encoder_model, parent_model=self) @@ -341,16 +344,16 @@ def __init__( def to(self, device: str): if isinstance(device, str): self._device = device.upper() - self.encoder._device = self._device - self.decoder._device = self._device - if self.use_cache: - self.decoder_with_past._device = self._device self.clear_requests() else: logger.debug(f"device must be of type {str} but got {type(device)} instead") return self + @property + def dtype(self) -> Optional[torch.dtype]: + return self.encoder.dtype or self.decoder.dtype + @add_start_docstrings_to_model_forward( SEQ2SEQ_MODEL_DOCSTRING.format("batch_size, sequence_length") + TRANSLATION_EXAMPLE.format( @@ -367,6 +370,7 @@ def forward( decoder_attention_mask: Optional[torch.LongTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Seq2SeqLMOutput: # Encode if needed : first prediction pass @@ -388,6 +392,7 @@ def forward( encoder_hidden_states=encoder_outputs.last_hidden_state, encoder_attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, + cache_position=cache_position, ) return Seq2SeqLMOutput(logits=decoder_outputs.logits, past_key_values=decoder_outputs.past_key_values) @@ -476,12 +481,34 @@ class OVEncoder: def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2SeqLM): self.model = model self.parent_model = parent_model - self._device = self.parent_model._device - self.device = torch.device("cpu") self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} + self.input_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.inputs} + self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs} self.main_input_name = self.parent_model.main_input_name or "input_ids" self.request = None + @property + def _device(self): + return self.parent_model._device + + @property + def device(self): + return 
self.parent_model.device + + @property + def dtype(self) -> Optional[torch.dtype]: + for dtype in self.input_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + @add_start_docstrings_to_model_forward(ENCODER_INPUTS_DOCSTRING) def forward( self, @@ -541,11 +568,11 @@ class OVDecoder: def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2SeqLM): self.model = model self.parent_model = parent_model - self._device = self.parent_model._device - self.device = torch.device("cpu") self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} + self.input_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.inputs} self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)} + self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs} self.key_value_output_names = [key for key in self.output_names if "key_values" in key or "present" in key] is_legacy = any("past_key_values" in key.get_any_name() for key in self.model.outputs) @@ -558,6 +585,28 @@ def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2Se self.request = None + @property + def _device(self) -> str: + return self.parent_model._device + + @property + def device(self) -> torch.device: + return self.parent_model.device + + @property + def dtype(self) -> Optional[torch.dtype]: + for dtype in self.input_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = OV_TO_PT_TYPE.get(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + @add_start_docstrings_to_model_forward(DECODER_INPUTS_DOCSTRING) def forward( self, @@ -566,6 +615,7 @@ def forward( encoder_attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Seq2SeqLMOutput: self._compile() # Model inputs @@ -592,6 +642,10 @@ def forward( if "decoder_attention_mask" in self.input_names and decoder_attention_mask is not None: inputs["decoder_attention_mask"] = decoder_attention_mask + + if "cache_position" in self.input_names and cache_position is not None: + inputs["cache_position"] = cache_position + # Run inference self.request.start_async(inputs, share_inputs=True) self.request.wait() @@ -832,26 +886,29 @@ class OVModelForSpeechSeq2Seq(OVModelForSeq2SeqLM): def prepare_inputs_for_generation( self, - input_ids, - attention_mask: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, + decoder_input_ids, past_key_values=None, + attention_mask=None, head_mask=None, decoder_head_mask=None, cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, + decoder_attention_mask=None, **kwargs, - ) -> Dict: - if decoder_attention_mask is None: - decoder_attention_mask = torch.ones_like(input_ids).to(input_ids.device) + ): + # cut decoder_input_ids if past is used + if past_key_values is not None: + decoder_input_ids = 
decoder_input_ids[:, -1:] + + if decoder_attention_mask is None and decoder_input_ids is not None: + decoder_attention_mask = torch.ones_like(decoder_input_ids).to(decoder_input_ids.device) return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, "head_mask": head_mask, "decoder_head_mask": decoder_head_mask, "cross_attn_head_mask": cross_attn_head_mask, @@ -874,6 +931,7 @@ def forward( decoder_attention_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Seq2SeqLMOutput: return super().forward( @@ -883,6 +941,7 @@ def forward( decoder_attention_mask=decoder_attention_mask, encoder_outputs=encoder_outputs, past_key_values=past_key_values, + cache_position=cache_position, **kwargs, ) @@ -899,13 +958,17 @@ def _from_pretrained( return super()._from_pretrained(model_id, config, **kwargs) -class _OVModelForWhisper(OVModelForSpeechSeq2Seq): +class _OVModelForWhisper(OVModelForSpeechSeq2Seq, WhisperForConditionalGeneration): """ Whisper implements its own generate() method. """ auto_model_class = WhisperForConditionalGeneration + # force the use of the WhisperForConditionalGeneration generate and prepare_inputs_for_generation methods + prepare_inputs_for_generation = WhisperForConditionalGeneration.prepare_inputs_for_generation + generate = WhisperForConditionalGeneration.generate + @classmethod def _from_pretrained( cls, @@ -915,415 +978,19 @@ def _from_pretrained( ): return super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs) - # Adapted from transformers.models.whisper.modeling_whisper - def generate( - self, - input_features: Optional[torch.Tensor] = None, - generation_config=None, - logits_processor=None, - stopping_criteria=None, - prefix_allowed_tokens_fn=None, - synced_gpus=False, - return_timestamps=None, - task=None, - language=None, - is_multilingual=None, - prompt_ids: Optional[torch.Tensor] = None, - num_segment_frames: Optional[int] = None, - return_token_timestamps: Optional[bool] = None, - return_segments: bool = False, - attention_mask: Optional[torch.Tensor] = None, - time_precision: int = 0.02, - return_dict_in_generate: Optional[bool] = None, - **kwargs, - ): - if "inputs" in kwargs: - input_features = kwargs.pop("inputs") - logging.warn( - "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.", - FutureWarning, - ) - - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - if generation_config is None: - generation_config = copy.deepcopy(self.generation_config) - - input_stride = ( - 1 * 2 - ) # NOTE: replaced from `self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]` - if num_segment_frames is None: - num_segment_frames = input_stride * self.config.max_source_positions - - # 1. 
Check whether we're in shortform or longform mode - if input_features is not None: - total_input_frames = input_features.shape[-1] - elif "encoder_outputs" in kwargs: - encoder_outputs_shape = ( - kwargs["encoder_outputs"][0].shape - if isinstance(kwargs["encoder_outputs"], BaseModelOutput) - else kwargs["encoder_outputs"].shape - ) - total_input_frames = encoder_outputs_shape[1] * input_stride - else: - raise ValueError("Make sure to provide either `input_features` or `encoder_outputs` to `generate`.") - - is_shortform = total_input_frames <= num_segment_frames - - # 2. Make sure the generation config is correctly set depending on whether timestamps are to be returned or not - if return_timestamps is True: - if not hasattr(generation_config, "no_timestamps_token_id"): - raise ValueError( - "You are trying to return timestamps, but the generation config is not properly set. " - "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. " - "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" - ) - generation_config.return_timestamps = return_timestamps - elif not is_shortform: - if return_timestamps is False: - raise ValueError( - "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which " - "requires the model to predict timestamp tokens. Please either pass `return_timestamps=True` or make sure to pass no more than 3000 mel input features." - ) - - if not hasattr(generation_config, "no_timestamps_token_id"): - raise ValueError( - "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which " - "requires the generation config to have `no_timestamps_token_id` correctly. " - "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. " - "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" - "or make sure to pass no more than 3000 mel input features." - ) - - logger.info("Setting `return_timestamps=True` for long-form generation.") - generation_config.return_timestamps = True - else: - generation_config.return_timestamps = False - - # 3. Make sure to correctly set language-related parameters - if is_multilingual is not None: - if not hasattr(generation_config, "is_multilingual"): - raise ValueError( - "The generation config is outdated and is thus not compatible with the `is_multilingual` argument " - "to `generate`. Please update the generation config as per the instructions " - "https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" - ) - generation_config.is_multilingual = is_multilingual - - if hasattr(generation_config, "is_multilingual") and not generation_config.is_multilingual: - if task is not None or language is not None: - raise ValueError( - "Cannot specify `task` or `language` for an English-only model. If the model is intended to be " - "multilingual, pass `is_multilingual=True` to generate, or update the generation config." - ) - - if language is not None: - if not hasattr(generation_config, "lang_to_id"): - raise ValueError( - "The generation config is outdated and is thus not compatible with the `language` argument " - "to `generate`. 
Either set the language using the `forced_decoder_ids` in the model config, " - "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" - ) - language = language.lower() - generation_config.language = language - if task is not None: - if not hasattr(generation_config, "task_to_id"): - raise ValueError( - "The generation config is outdated and is thus not compatible with the `task` argument " - "to `generate`. Either set the task using the `forced_decoder_ids` in the model config, " - "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" - ) - generation_config.task = task - - # 4. Add forced decoder ids depending on passed `language`, `task`,`prompt_ids`, `return_token_timestamps` and `return_timestamps` - forced_decoder_ids = None - # Legacy code for backward compatibility - if hasattr(self.config, "forced_decoder_ids") and self.config.forced_decoder_ids is not None: - forced_decoder_ids = self.config.forced_decoder_ids - elif ( - hasattr(self.generation_config, "forced_decoder_ids") - and self.generation_config.forced_decoder_ids is not None - ): - forced_decoder_ids = self.generation_config.forced_decoder_ids - else: - forced_decoder_ids = kwargs.get("forced_decoder_ids", None) - - if task is not None or language is not None or (forced_decoder_ids is None and prompt_ids is not None): - forced_decoder_ids = [] - if hasattr(generation_config, "language"): - if generation_config.language in generation_config.lang_to_id.keys(): - language_token = generation_config.language - elif generation_config.language in TO_LANGUAGE_CODE.keys(): - language_token = f"<|{TO_LANGUAGE_CODE[generation_config.language]}|>" - elif generation_config.language in TO_LANGUAGE_CODE.values(): - language_token = f"<|{generation_config.language}|>" - else: - is_language_code = len(generation_config.language) == 2 - raise ValueError( - f"Unsupported language: {generation_config.language}. Language should be one of:" - f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}." - ) - forced_decoder_ids.append((1, generation_config.lang_to_id[language_token])) - else: - forced_decoder_ids.append((1, None)) # automatically detect the language - - if hasattr(generation_config, "task"): - if generation_config.task in TASK_IDS: - forced_decoder_ids.append((2, generation_config.task_to_id[generation_config.task])) - else: - raise ValueError( - f"The `{generation_config.task}`task is not supported. The task should be one of `{TASK_IDS}`" - ) - elif hasattr(generation_config, "task_to_id"): - forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"])) # defaults to transcribe - if hasattr(generation_config, "no_timestamps_token_id") and not generation_config.return_timestamps: - idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1 - forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id)) - - if forced_decoder_ids is not None: - generation_config.forced_decoder_ids = forced_decoder_ids - - if prompt_ids is not None: - if kwargs.get("decoder_start_token_id") is not None: - raise ValueError( - "When specifying `prompt_ids`, you cannot also specify `decoder_start_token_id` as it gets overwritten." 
- ) - prompt_ids = prompt_ids.tolist() - decoder_start_token_id, *text_prompt_ids = prompt_ids - # Slicing the text prompt ids in a manner consistent with the OpenAI implementation - # to accomodate context space for the prefix (see https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/decoding.py#L599) - text_prompt_ids = text_prompt_ids[-self.config.max_target_positions // 2 - 1 :] - # Set the decoder_start_token_id to <|startofprev|> - kwargs.update({"decoder_start_token_id": decoder_start_token_id}) - - # If the user passes `max_new_tokens`, increase its number to account for the prompt - if kwargs.get("max_new_tokens", None) is not None: - kwargs["max_new_tokens"] += len(text_prompt_ids) - if kwargs["max_new_tokens"] >= self.config.max_target_positions: - raise ValueError( - f"The length of the sliced `prompt_ids` is {len(text_prompt_ids)}, and the `max_new_tokens` " - f"{kwargs['max_new_tokens'] - len(text_prompt_ids)}. Thus, the combined length of the sliced " - f"`prompt_ids` and `max_new_tokens` is: {kwargs['max_new_tokens']}. This exceeds the " - f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. " - "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " - f"so that their combined length is less that {self.config.max_target_positions}." - ) - - # Reformat the forced_decoder_ids to incorporate the prompt - non_prompt_forced_decoder_ids = ( - kwargs.pop("forced_decoder_ids", None) or generation_config.forced_decoder_ids - ) - forced_decoder_ids = [ - *text_prompt_ids, - generation_config.decoder_start_token_id, - *[token for _rank, token in non_prompt_forced_decoder_ids], - ] - forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_decoder_ids)] - generation_config.forced_decoder_ids = forced_decoder_ids - - if return_token_timestamps: - kwargs["output_attentions"] = True - return_dict_in_generate = True - - if getattr(generation_config, "task", None) == "translate": - logger.warning("Token-level timestamps may not be reliable for task 'translate'.") - if not hasattr(generation_config, "alignment_heads"): - raise ValueError( - "Model generation config has no `alignment_heads`, token-level timestamps not available. " - "See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config." - ) - - if kwargs.get("num_frames") is not None: - generation_config.num_frames = kwargs.pop("num_frames") - - if generation_config.return_timestamps is True: - last_forced_decoder_ids = ( - generation_config.forced_decoder_ids[-1][-1] - if hasattr(self.config, "forced_decoder_ids") and self.config.forced_decoder_ids - else None - ) - if last_forced_decoder_ids == self.generation_config.no_timestamps_token_id: - # remove no_timestamp to be forcefully generated if we want to return timestamps - # this is also important to make sure `WhisperTimeStampLogitsProcessor` functions correctly - forced_decoder_ids = generation_config.forced_decoder_ids[:-1] - # Make sure that if list is empty we set it to None - generation_config.forced_decoder_ids = None if len(forced_decoder_ids) == 0 else forced_decoder_ids - - timestamp_processor = [WhisperTimeStampLogitsProcessor(generation_config)] - logits_processor = ( - timestamp_processor if logits_processor is None else timestamp_processor + logits_processor - ) - - # 5. 
If we're in shortform mode, simple generate the whole input at once and return the output - if is_shortform: - outputs = super().generate( - input_features, - generation_config, - logits_processor, - stopping_criteria, - prefix_allowed_tokens_fn, - synced_gpus, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - ) - - if return_token_timestamps and hasattr(generation_config, "alignment_heads"): - num_frames = getattr(generation_config, "num_frames", None) - outputs["token_timestamps"] = self._extract_token_timestamps( - outputs, generation_config.alignment_heads, num_frames=num_frames - ) - - return outputs - - # 6. Else we're in longform mode which is more complex. We need to chunk the audio input depending on when the model generated - # timestamp tokens - # 6.1 Set running parameters for while loop - if not return_segments and return_dict_in_generate: - raise ValueError( - "Make sure to set `return_segments=True` to return generation outputs as part of the `'segments' key.`" - ) - - # if input is longer than 30 seconds we default to long-form generation - timestamp_begin = self.generation_config.no_timestamps_token_id + 1 - # input stride is mel frames per encoder output vector which is the product of all conv strides - batch_size = input_features.shape[0] - - if batch_size > 1 and attention_mask is None: - raise ValueError( - "When doing long-form audio transcription, make sure to pass an `attention_mask`. You can retrieve the `attention_mask` by doing `processor(audio, ..., return_attention_mask=True)` " - ) - elif batch_size > 1: - max_frames = attention_mask.sum(-1).cpu().to(torch.long) - seek = torch.zeros((batch_size,), dtype=torch.long) - else: - max_frames = torch.ones((1,), dtype=torch.long) * total_input_frames - seek = torch.zeros((1,), dtype=torch.long) - - current_segments = [[] for _ in range(batch_size)] - cur_to_prev_index_map = list(range(batch_size)) - - # batch size can decrease during the run - cur_bsz = prev_bsz = batch_size - - # 6.2 Transcribe audio until we reach the end of all input audios - while (seek < max_frames).any(): - prev_bsz = cur_bsz - - # 6.3 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop - # in case one audio finished earlier than another one. 
Thus, we need to keep a table of "previous-index-2-current-index" in order - # to know which original audio is being decoded - new_cur_to_prev_index_map = [] - for i in range(prev_bsz): - prev_i = cur_to_prev_index_map[i] - if seek[prev_i] >= max_frames[prev_i]: - cut_index = i + (cur_bsz - prev_bsz) - cur_bsz -= 1 - input_features = torch.cat([input_features[:cut_index], input_features[cut_index + 1 :]], dim=0) - else: - # cut out index that goes away - new_cur_to_prev_index_map.append(prev_i) - - # 6.4 Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk - cur_to_prev_index_map = new_cur_to_prev_index_map - time_offset = seek * time_precision / input_stride - seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames) - - # 6.5 Make sure that all inputs are padded to the same input length - segment_input = [] - for i in range(cur_bsz): - prev_i = cur_to_prev_index_map[i] - segment_input_slice = input_features[ - i : i + 1, :, seek[prev_i] : seek[prev_i] + seek_num_frames[prev_i] - ] - - if segment_input_slice.shape[-1] < num_segment_frames: - # pad to 3000 if necessary - segment_input_slice = torch.nn.functional.pad( - segment_input_slice, pad=(0, num_segment_frames - segment_input_slice.shape[-1]) - ) - - segment_input.append(segment_input_slice) - - segment_input = torch.cat(segment_input, dim=0) - - # 6.6 Batch generate current chunk - seek_outputs = super().generate( - segment_input, - generation_config, - logits_processor, - stopping_criteria, - prefix_allowed_tokens_fn, - synced_gpus, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - ) - - if return_token_timestamps and hasattr(generation_config, "alignment_heads"): - num_frames = getattr(generation_config, "num_frames", None) - seek_outputs["token_timestamps"] = self._extract_token_timestamps( - seek_outputs, generation_config.alignment_heads, num_frames=num_frames - ) - - if return_dict_in_generate: - seek_sequences = seek_outputs["sequences"] - seek_outputs = [ - {k: v[i] for k, v in seek_outputs.items()} - for i in range(next(iter(seek_outputs.values())).size(0)) - ] - else: - seek_sequences = seek_outputs - - # 6.7 Loop over each decoded audio individually as each decoding can be of a different length - for i, seek_sequence in enumerate(seek_sequences): - prev_i = cur_to_prev_index_map[i] - - # make sure we cut a predicted EOS token if we are not finished with the generation yet - is_not_final = (seek[prev_i] + num_segment_frames) < max_frames[prev_i] - if is_not_final and seek_sequence[-1] == self.generation_config.eos_token_id: - seek_sequence = seek_sequence[:-1] - - # remove all padding tokens - if seek_sequence[-1] == self.generation_config.pad_token_id: - num_paddings = (seek_sequence == self.generation_config.pad_token_id).sum() - seek_sequence = seek_sequence[:-num_paddings] - - segments, segment_offset = self._retrieve_segment( - seek_sequence=seek_sequence, - seek_outputs=seek_outputs, - time_offset=time_offset, - timestamp_begin=timestamp_begin, - seek_num_frames=seek_num_frames, - cur_bsz=cur_bsz, - time_precision=time_precision, - input_stride=input_stride, - prev_idx=prev_i, - idx=i, - ) - - current_segments[prev_i] += segments - seek[prev_i] += segment_offset - - # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted - # output tokens from the list of dicts. 
If we use batch size > 1, we make sure to pad the output - sequences = [] - max_total_length = 0 - for current_segment_list in current_segments: - sequences.append(torch.cat([d["tokens"] for d in current_segment_list], dim=-1)) - max_total_length = max(max_total_length, len(sequences[-1])) - - for i in range(batch_size): - sequences[i] = torch.nn.functional.pad( - sequences[i], pad=(0, max_total_length - len(sequences[i])), value=self.generation_config.pad_token_id - ) + class DummyWhisperModel: + def __init__(self): + self.encoder = self.Encoder() - sequences = torch.stack(sequences, dim=0) + class Encoder: + def __init__(self): + self.conv1 = self.Conv(stride=(1,)) + self.conv2 = self.Conv(stride=(2,)) - # 8. If we return all segments, the predicted output sequences are put under `"sequences"`. - if return_segments: - return {"sequences": sequences, "segments": current_segments} + class Conv: + def __init__(self, stride): + self.stride = stride - return sequences + # a dummy model attribute that's used in the generate method to compute the input stride + # input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0] + model = DummyWhisperModel() diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 69a750fb65..06ad451237 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -21,6 +21,7 @@ from typing import Tuple, Union import numpy as np +import torch from huggingface_hub import model_info from openvino.runtime import Core, Type, properties from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -52,7 +53,6 @@ TEXTUAL_INVERSION_NAME_SAFE = "learned_embeds.safetensors" TEXTUAL_INVERSION_EMBEDDING_KEY = "text_model.embeddings.token_embedding.weight" - OV_TO_NP_TYPE = { "boolean": np.bool_, "i8": np.int8, @@ -68,6 +68,21 @@ "f64": np.float64, } +OV_TO_PT_TYPE = { + "boolean": torch.bool, + "i8": torch.int8, + "u8": torch.uint8, + "i16": torch.int16, + "u16": torch.uint16, + "i32": torch.int32, + "u32": torch.uint32, + "i64": torch.int64, + "u64": torch.uint64, + "f16": torch.float16, + "f32": torch.float32, + "f64": torch.float64, +} + STR_TO_OV_TYPE = { "boolean": Type.boolean, diff --git a/setup.py b/setup.py index 8abcbfaf08..853d012e54 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "transformers>=4.36.0,<4.43.0", - "optimum>=1.21.2,<1.22.0", + "transformers>=4.36.0,<4.44.0", + "optimum@git+https://github.com/huggingface/optimum.git@v1.21.3-release", "datasets>=1.4.0", "sentencepiece", "setuptools", @@ -59,10 +59,10 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"], + "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate", "transformers<4.43.0"], "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.11.0"], - "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.41.2"], + "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<4.44.0"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index b6635ca154..8e56dd3957 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -103,8 +103,8 @@ def test_compare_to_transformers(self, model_arch): for output_name in {"logits", 
"last_hidden_state"}: if output_name in transformers_outputs: self.assertTrue(torch.allclose(outputs[output_name], transformers_outputs[output_name], atol=1e-4)) - self.assertTrue(torch.equal(outputs[output_name], loaded_model_outputs[output_name])) - self.assertTrue(torch.equal(outputs[output_name], init_model_outputs[output_name])) + self.assertTrue(torch.allclose(outputs[output_name], loaded_model_outputs[output_name])) + self.assertTrue(torch.allclose(outputs[output_name], init_model_outputs[output_name])) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): @@ -259,7 +259,7 @@ def test_pipeline(self, model_arch): model.config.encoder_no_repeat_ngram_size = 0 model.to("cpu") pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) - outputs = pipe("This is a sample", max_length=10) + outputs = pipe("This is a sample", max_new_tokens=10) self.assertEqual(pipe.device, model.device) self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs)) @@ -330,7 +330,7 @@ def test_compare_with_and_without_past_key_values(self): model_with_pkv.generate(**tokens) with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 ) model_without_pkv = IPEXModelForCausalLM.from_pretrained( model_id, use_cache=False, subfolder="model_without_pkv" @@ -339,16 +339,11 @@ def test_compare_with_and_without_past_key_values(self): model_without_pkv.generate(**tokens) with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 ) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) - self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - # self.assertTrue( - # without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - # f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - # f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - # ) + self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + tokens.input_ids.shape[1]) + self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + tokens.input_ids.shape[1]) class IPEXModelForAudioClassificationTest(unittest.TestCase): diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index c4ae471a0f..767097a5dd 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -137,9 +137,9 @@ def test_text_generation_pipeline_inference(self, model_arch): ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex") inputs = "Describe a real-world application of AI." 
with torch.inference_mode(): - transformers_output = transformers_generator(inputs) + transformers_output = transformers_generator(inputs, max_new_tokens=10) with torch.inference_mode(): - ipex_output = ipex_generator(inputs) + ipex_output = ipex_generator(inputs, max_new_tokens=10) self.assertTrue(isinstance(ipex_generator.model, IPEXModelForCausalLM)) self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) self.assertEqual(transformers_output[0]["generated_text"], ipex_output[0]["generated_text"]) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 839dd55add..79c3920e08 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -241,6 +241,7 @@ def test_exporters_cli_int4_with_local_model_and_default_config(self): # overload for matching with default configuration pt_model.config._name_or_path = "tiiuae/falcon-7b-instruct" pt_model.save_pretrained(tmpdir) + subprocess.run( f"optimum-cli export openvino --model {tmpdir} --task text-generation-with-past --weight-format int4 {tmpdir}", shell=True, @@ -251,16 +252,23 @@ def test_exporters_cli_int4_with_local_model_and_default_config(self): rt_info = model.model.get_rt_info() self.assertTrue("nncf" in rt_info) self.assertTrue("weight_compression" in rt_info["nncf"]) - default_config = _DEFAULT_4BIT_CONFIGS["tiiuae/falcon-7b-instruct"] model_weight_compression_config = rt_info["nncf"]["weight_compression"] - sym = default_config.pop("sym", False) + + default_config = _DEFAULT_4BIT_CONFIGS["tiiuae/falcon-7b-instruct"] bits = default_config.pop("bits", None) self.assertEqual(bits, 4) - mode = f'int{bits}_{"sym" if sym else "asym"}' - default_config["mode"] = mode + sym = default_config.pop("sym", False) + default_config["mode"] = f'int{bits}_{"sym" if sym else "asym"}' + + quant_method = default_config.pop("quant_method", None) + default_config["awq"] = quant_method == "awq" + default_config["gptq"] = quant_method == "gptq" + + default_config.pop("dataset", None) + for key, value in default_config.items(): - self.assertTrue(key in model_weight_compression_config) + self.assertIn(key, model_weight_compression_config) self.assertEqual( model_weight_compression_config[key].value, str(value), diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 26071be2eb..3eb7feaecc 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -941,7 +941,7 @@ def test_beam_search(self, model_arch): eos_token_id=None, ) - if model_arch == "minicpm": + if model_arch in ["minicpm", "internlm2"]: beam_sample_gen_config.top_k = 1 group_beam_search_gen_config = GenerationConfig( @@ -970,12 +970,15 @@ def test_beam_search(self, model_arch): group_beam_search_gen_config, constrained_beam_search_gen_config, ] + set_seed(SEED) ov_model_stateful = OVModelForCausalLM.from_pretrained( model_id, export=True, use_cache=True, stateful=True, **model_kwargs ) + set_seed(SEED) ov_model_stateless = OVModelForCausalLM.from_pretrained( model_id, export=True, use_cache=True, stateful=False, **model_kwargs ) + set_seed(SEED) transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) if model_arch == "arctic": @@ -1662,15 +1665,15 @@ def _generate_random_audio_data(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): - model_id = MODEL_NAMES[model_arch] set_seed(SEED) + model_id = MODEL_NAMES[model_arch] + transformers_model = 
AutoModelForSpeechSeq2Seq.from_pretrained(model_id) ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_model.config, PretrainedConfig) - transformers_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id) + processor = get_preprocessor(model_id) data = self._generate_random_audio_data() features = processor.feature_extractor(data, return_tensors="pt") - decoder_start_token_id = transformers_model.config.decoder_start_token_id decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} @@ -1699,7 +1702,7 @@ def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True) - model.eval() + processor = get_preprocessor(model_id) pipe = pipeline( "automatic-speech-recognition", diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index ac158b69bf..d5f37c5f17 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -897,12 +897,16 @@ def test_calibration_data_uniqueness(self, model_id, apply_caching): ) for _ in range(2): input_features = self._generate_random_audio_data(processor) - ov_model.generate(input_features) + ov_model.generate(input_features, max_new_tokens=10, min_new_tokens=10) data_hashes_per_key = defaultdict(list) data_id_per_key = defaultdict(set) + for inputs_dict in calibration_data: for k, v in inputs_dict.items(): + if k == "input_ids": + continue + x = (v.numpy() if isinstance(v, torch.Tensor) else v).copy() data_hashes_per_key[k].append(hash(x.tobytes())) data_id_per_key[k].add(id(v)) @@ -911,7 +915,7 @@ def test_calibration_data_uniqueness(self, model_id, apply_caching): self.assertTrue(any(data_hashes[0] != it for it in data_hashes)) if apply_caching: # With caching, encoder hidden states tensors should be cached, resulting in only 2 tensors stored - self.assertTrue(len(data_id_per_key["encoder_hidden_states"]) == 2) + self.assertEqual(len(data_id_per_key["encoder_hidden_states"]), 2) else: # Without caching, encoder hidden states tensors will be unique for each collected input - self.assertTrue(len(data_id_per_key["encoder_hidden_states"]) > 2) + self.assertGreater(len(data_id_per_key["encoder_hidden_states"]), 2) diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index e735a07fb4..cb68c75908 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -248,7 +248,7 @@ def test_compare_to_diffusers(self, model_arch: str): self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) # Compare model devices - self.assertEqual(pipeline.device.type, ov_pipeline.device) + self.assertEqual(pipeline.device, ov_pipeline.device) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_image_reproducibility(self, model_arch: str): @@ -406,7 +406,7 @@ def test_compare_to_diffusers(self, model_arch: str): # Compare model outputs self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) # Compare model devices - self.assertEqual(pipeline.device.type, ov_pipeline.device) + self.assertEqual(pipeline.device, ov_pipeline.device) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_image_reproducibility(self, model_arch: str): @@ -536,7 +536,7 @@ def test_compare_to_diffusers(self, model_arch: str): # Compare model outputs self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) # Compare model devices - 
self.assertEqual(pipeline.device.type, ov_pipeline.device)
+        self.assertEqual(pipeline.device, ov_pipeline.device)

     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @pytest.mark.run_slow