diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 95ecea1213..a4534a5ada 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -92,6 +92,17 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "precision (by default 20% in INT8). This helps to achieve better accuracy after weight quantization."
         ),
     )
+    optional_group.add_argument(
+        "--disable-stateful",
+        action="store_true",
+        help=(
+            "Disable stateful converted models; stateless models will be generated instead. Stateful models are produced by default when this option is not used. "
+            "In stateful models all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. "
+            "Using --disable-stateful may result in sub-optimal inference performance. "
+            "Use it when you intentionally want to use a stateless model, for example, to be compatible with existing "
+            "OpenVINO native inference code that expects kv-cache inputs and outputs in the model."
+        ),
+    )
 
 
 class OVExportCommand(BaseOptimumCLICommand):
@@ -138,6 +149,7 @@ def run(self):
             trust_remote_code=self.args.trust_remote_code,
             pad_token_id=self.args.pad_token_id,
             compression_option=self.args.weight_format,
-            compression_ratio=self.args.ratio
+            compression_ratio=self.args.ratio,
+            stateful=not self.args.disable_stateful,
             # **input_shapes,
         )
diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py
index d87d8dda9e..6fd7970a07 100644
--- a/optimum/exporters/openvino/__init__.py
+++ b/optimum/exporters/openvino/__init__.py
@@ -1,5 +1,6 @@
 from .__main__ import main_export
 from .convert import export, export_models, export_pytorch_via_onnx
+from .stateful import ensure_stateful_is_available, patch_stateful
 
 
 __all__ = ["main_export", "export", "export_models"]
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index 54fe1193e5..750005802c 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -28,6 +28,7 @@
 
 from ...intel.utils.import_utils import is_nncf_available, is_optimum_version, is_transformers_version
 from .convert import export_models
+from .stateful import ensure_export_task_support_stateful
 
 
 if is_optimum_version(">=", "1.16.0"):
@@ -65,6 +66,7 @@ def main_export(
     fn_get_submodels: Optional[Callable] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[float] = None,
+    stateful: bool = True,
     **kwargs_shapes,
 ):
     """
@@ -124,6 +126,8 @@ def main_export(
             `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point, `f32` - means no compression.
         compression_ratio (`Optional[float]`, defaults to `None`):
             Compression ratio between primary and backup precision (only relevant to INT4).
+        stateful (`bool`, defaults to `True`):
+            Produce a stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
         **kwargs_shapes (`Dict`):
             Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export.
@@ -277,6 +281,9 @@ class StoreAttr(object):
             possible_synonyms = ""
         logger.info(f"Automatic task detection to {task}{possible_synonyms}.")
 
+    task_support_stateful = ensure_export_task_support_stateful(task)
+    stateful = stateful and task_support_stateful
+
     preprocessors = maybe_load_preprocessors(
         model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
     )
@@ -373,6 +380,7 @@ class StoreAttr(object):
         device=device,
         compression_option=compression_option,
         compression_ratio=compression_ratio,
+        stateful=stateful,
         model_kwargs=model_kwargs,
     )
 
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index 56c5a10e5d..947d8bd989 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -32,6 +32,8 @@
 from optimum.utils import is_diffusers_available
 
 from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
+from .model_patcher import patch_model_with_bettertransformer
+from .stateful import ensure_stateful_is_available, patch_stateful
 from .utils import (
     OV_XML_FILE_NAME,
     clear_class_registry,
@@ -102,6 +104,7 @@ def export(
     model_kwargs: Optional[Dict[str, Any]] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[float] = None,
+    stateful: bool = True,
 ) -> Tuple[List[str], List[str]]:
     """
     Exports a Pytorch or TensorFlow model to an OpenVINO Intermediate Representation.
@@ -125,6 +128,8 @@ def export(
             Compression ratio between primary and backup precision (only relevant to INT4).
         input_shapes (`Optional[Dict]`, defaults to `None`):
             If specified, allows to use specific shapes for the example input provided to the exporter.
+        stateful (`bool`, defaults to `True`):
+            Produce a stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
 
     Returns:
         `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from
@@ -139,6 +144,10 @@ def export(
     if "diffusers" in str(model.__class__) and not is_diffusers_available():
         raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.")
 
+    if stateful:
+        # This will be checked anyway after the model conversion, but checking it earlier saves time for the user if an unsuitable OpenVINO version is used
+        stateful = ensure_stateful_is_available()
+
     if is_torch_available() and isinstance(model, nn.Module):
         return export_pytorch(
             model,
@@ -150,6 +159,7 @@ def export(
             compression_option=compression_option,
             compression_ratio=compression_ratio,
             model_kwargs=model_kwargs,
+            stateful=stateful,
         )
 
     elif is_tf_available() and issubclass(type(model), TFPreTrainedModel):
@@ -160,7 +170,9 @@ def export(
             raise RuntimeError("`tf2onnx` does not support export on CUDA device.")
         if input_shapes is not None:
             logger.info("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.")
-        return export_tensorflow(model, config, opset, output)
+        return export_tensorflow(
+            model, config, opset, output, compression_option=compression_option, compression_ratio=compression_ratio
+        )
 
     else:
         raise RuntimeError(
@@ -271,6 +283,7 @@ def export_pytorch(
     model_kwargs: Optional[Dict[str, Any]] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[float] = None,
+    stateful: bool = False,
 ) -> Tuple[List[str], List[str]]:
     """
     Exports a PyTorch model to an OpenVINO Intermediate Representation.
@@ -291,6 +304,13 @@ def export_pytorch(
             If specified, allows to use specific shapes for the example input provided to the exporter.
         model_kwargs (optional[Dict[str, Any]], defaults to `None`):
             Additional kwargs for model export
+        compression_option (`Optional[str]`, defaults to `None`):
+            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
+            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point.
+        compression_ratio (`Optional[float]`, defaults to `None`):
+            Compression ratio between primary and backup precision (only relevant to INT4).
+        stateful (`bool`, defaults to `False`):
+            Produce a stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
 
     Returns:
         `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, and the named inputs from
@@ -302,6 +322,15 @@ def export_pytorch(
     logger.info(f"Using framework PyTorch: {torch.__version__}")
     output = Path(output)
 
+    if stateful:
+        # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect
+        # both of them to be applied to achieve the best performance.
+        # TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation.
+        model = patch_model_with_bettertransformer(model)
+        # TODO: Consider unpatching the model after export is done at the end of this function.
+        # Now it is left as-is because the model is not expected to be used after the call to export_pytorch, and
+        # this function is one of the _internal_ steps in a bigger model conversion pipeline.
+
     with torch.no_grad():
         model.config.torchscript = False
         model.config.return_dict = True
@@ -380,6 +409,14 @@ def ts_patched_forward(*args, **kwargs):
             logger.warning(f"Export model to OpenVINO directly failed with: \n{ex}.\nModel will be exported to ONNX")
             if patch_model_forward:
                 model.forward = orig_forward
+            if stateful:
+                # cannot raise because stateful is enabled by default and it would break backward compatibility for models that couldn't convert to OV directly
+                # TODO: Implement stateful for ONNX path as well, not doing it right now because of lack of validation
+                logger.warn(
+                    "[ WARNING ] Making stateful models is not supported when exporting to ONNX as an intermediate step. "
+                    "A stateless model will be exported instead. It may result in sub-optimal inference performance. "
+                    "Provide a model that can be converted to OpenVINO without fallback to the ONNX conversion path."
+                )
             return export_pytorch_via_onnx(
                 model,
                 config,
@@ -411,6 +448,10 @@ def ts_patched_forward(*args, **kwargs):
             inp_tensor.get_node().set_partial_shape(static_shape)
             inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype))
         ov_model.validate_nodes_and_infer_types()
+
+        if stateful:
+            patch_stateful(model.config, ov_model)
+
         _save_model(ov_model, output, compression_option=compression_option, compression_ratio=compression_ratio)
         clear_class_registry()
         del model
@@ -430,6 +471,7 @@ def export_models(
     model_kwargs: Optional[Dict[str, Any]] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[int] = None,
+    stateful: bool = True,
 ) -> Tuple[List[List[str]], List[List[str]]]:
     """
     Export the models to OpenVINO IR format
@@ -451,6 +493,8 @@ def export_models(
             Compression ratio between primary and backup precision (only relevant to INT4).
         model_kwargs (Optional[Dict[str, Any]], optional):
             Additional kwargs for model export.
+        stateful (`bool`, defaults to `True`):
+            Produce a stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
 
     Raises:
         ValueError: if custom names set not equal of number of models
@@ -481,6 +525,7 @@
                 model_kwargs=model_kwargs,
                 compression_option=compression_option,
                 compression_ratio=compression_ratio,
+                stateful=stateful,
             )
         )
 
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
new file mode 100644
index 0000000000..37106eacf8
--- /dev/null
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -0,0 +1,39 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging as log
+
+from optimum.intel.utils.import_utils import is_torch_version
+
+
+def patch_model_with_bettertransformer(model):
+    if is_torch_version("<", "2.0"):
+        log.warn(
+            "Integration of the Scaled Dot Product Attention optimization is supported only with torch >= 2.0. "
+            "Using the model with stateful=True may be ineffective if the model does not contain torch.nn.functional.scaled_dot_product_attention. "
+            "It is recommended to upgrade the PyTorch version to use a stateful model, or to use stateful=False."
+        )
+    # model already has required SDPA implementation
+    if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa":
+        return model
+    try:
+        model = model.to_bettertransformer()
+    except Exception as e:
+        log.warn(
+            f"Cannot apply model.to_bettertransformer because of the exception:\n{e}."
+ " Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention" + ) + return model + + return model diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py new file mode 100644 index 0000000000..e6ec1879a5 --- /dev/null +++ b/optimum/exporters/openvino/stateful.py @@ -0,0 +1,225 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging as log +from typing import List + +import numpy as np +from transformers import PretrainedConfig + +import openvino as ov +from openvino.runtime import opset13 +from optimum.exporters import TasksManager +from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version +from optimum.utils.normalized_config import NormalizedConfigManager + + +def model_has_state(ov_model: ov.Model): + # TODO: Provide a better way based on the variables availability, but OV Python API doesn't expose required methods + return len(ov_model.get_sinks()) > 0 + + +def model_has_input_output_name(ov_model: ov.Model, name: str): + """ + Helper function for checking that model has specified input or output name + + Parameters: + ov_model (ov.Model): # TODO: Can we derive the dimensions from the model topology? + name (str): + name of input or output + + Returns: + True if input or output with requested name exists else False + """ + return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], []) + + +def fuse_cache_reorder( + ov_model: ov.Model, not_kv_inputs: List[str], key_value_input_names: List[str], gather_dim: int +): + """ + Fuses reored_cache during generate cycle into ov.Model. Used with stateful models, because we can not modify model state directly. + + Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model. + Should be run before make_stateful. Implements optimumum's _reorder_cache + inside the model in the beginning of each iteration. + Gather works along given gather_dim dimension that may vary from model to model. + KV-cache inputs are identified based on names in key_value_input_names. + Append the new beam_idx parameter to not_kv_inputs. + + Parameters: + ov_model (`ov.Model`): + openvino model for processing + not_kv_inputs (`List[str]`): + list of input nodes in model that not related to past key values + key_value_input_names (`List[str]`): + list of names for key value input layers + gather_dim (int): + dimension for gathering cache during reorder pass + """ + + if model_has_input_output_name(ov_model, "beam_idx"): + raise ValueError("Model already has fused cache") + input_batch = ov_model.input("input_ids").get_partial_shape()[0] + beam_idx = opset13.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch])) + beam_idx.output(0).get_tensor().add_names({"beam_idx"}) # why list is not accepted? 
+    ov_model.add_parameters([beam_idx])
+    not_kv_inputs.append(ov_model.inputs[-1])
+    # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
+    for input_name in key_value_input_names:
+        parameter_output_port = ov_model.input(input_name)
+        consumers = parameter_output_port.get_target_inputs()
+        gather = opset13.gather(parameter_output_port, beam_idx, opset13.constant(gather_dim))
+        for consumer in consumers:
+            consumer.replace_source_output(gather.output(0))
+    ov_model.validate_nodes_and_infer_types()
+
+
+def build_state_initializer(ov_model: ov.Model, batch_dim: int):
+    """
+    Build initialization ShapeOf expression for all ReadValue ops
+
+    Parameters:
+        ov_model (ov.Model):
+            openvino model
+        batch_dim (int):
+            index of dimension corresponding to batch size
+    """
+    input_ids = ov_model.input("input_ids")
+    batch = opset13.gather(opset13.shape_of(input_ids, output_type="i64"), opset13.constant([0]), opset13.constant(0))
+    for op in ov_model.get_ops():
+        if op.get_type_name() == "ReadValue":
+            dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))]
+            dims[batch_dim] = batch
+            dims = [opset13.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim for dim in dims]
+            shape = opset13.concat(dims, axis=0)
+            broadcast = opset13.broadcast(opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape)
+            op.set_arguments([broadcast])
+    ov_model.validate_nodes_and_infer_types()
+
+
+def make_stateful(
+    ov_model: ov.Model,
+    not_kv_inputs: List[str],
+    key_value_input_names: List[str],
+    key_value_output_names: List[str],
+    batch_dim: int,
+    num_attention_heads: int,
+    num_beams_and_batch: int = None,
+):
+    """
+    Hides kv-cache inputs and outputs inside the model as variables.
+
+    Parameters:
+        ov_model (ov.Model):
+            openvino model
+        not_kv_inputs (`List[str]`):
+            list of input nodes in the model that are not related to past key values
+        key_value_input_names (`List[str]`):
+            list of names for key value input layers
+        key_value_output_names (`List[str]`):
+            list of names for key value output layers
+        batch_dim (int):
+            index of batch dimension in key value layers
+        num_attention_heads (int):
+            number of attention heads for batch dimension initialization
+        num_beams_and_batch (int):
+            precalculated number of beams and batch for shapes initialization
+    """
+    from openvino._offline_transformations import apply_make_stateful_transformation
+
+    input_output_map = {}
+    # TODO: Can we derive the dimensions from the model topology?
+
+    if num_beams_and_batch is not None:
+        # Set batch size for input_ids and attention mask to avoid a dynamic dimension being propagated from the end of the model back to ReadValue
+        for input in not_kv_inputs:
+            shape = input.get_partial_shape()
+            if shape.rank.get_length() <= 2:  # == 1 for beam_index
+                shape[0] = num_beams_and_batch
+                input.get_node().set_partial_shape(shape)
+            else:
+                log.warn(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")
+
+    for kv_name_pair in zip(key_value_input_names, key_value_output_names):
+        input_output_map[kv_name_pair[0]] = kv_name_pair[1]
+        if num_beams_and_batch is not None:
+            input = ov_model.input(kv_name_pair[0])
+            shape = input.get_partial_shape()
+            shape[batch_dim] = num_beams_and_batch * num_attention_heads
+            input.get_node().set_partial_shape(shape)
+
+    if num_beams_and_batch is not None:
+        # Re-validate the model if the shapes were altered above
+        ov_model.validate_nodes_and_infer_types()
+
+    apply_make_stateful_transformation(ov_model, input_output_map)
+    if num_beams_and_batch is None:
+        build_state_initializer(ov_model, batch_dim)
+
+
+def ensure_stateful_is_available(warn=True):
+    """
+    Check the openvino version and warn if it does not support stateful models
+    """
+    if is_openvino_version("<", "2023.3"):
+        if warn:
+            log.warn(
+                f"Could not create or use a stateful model when using an old version of openvino=={_openvino_version}. It may result in sub-optimal inference performance. "
+                "Install openvino>=2023.3.0."
+            )
+        return False
+    return True
+
+
+def ensure_export_task_support_stateful(task: str):
+    task = TasksManager.map_from_synonym(task)
+    return task == "text-generation-with-past"
+
+
+def patch_stateful(config: PretrainedConfig, ov_model: ov.Model):
+    """
+    Apply the stateful transformation to the model to hide key/value inputs inside the model.
+    Select transformation parameters based on the model architecture.
+
+    Parameters:
+        config (`PretrainedConfig`):
+            model pretrained config
+        ov_model (`ov.Model`):
+            openvino model
+    """
+
+    key_value_input_names = [
+        key.get_any_name() for key in ov_model.inputs if any("key_values" in key_name for key_name in key.get_names())
+    ]
+    key_value_output_names = [
+        key.get_any_name() for key in ov_model.outputs if any("present" in key_name for key_name in key.get_names())
+    ]
+    not_kv_inputs = [
+        input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())
+    ]
+    if not key_value_input_names or not key_value_output_names:
+        return
+
+    # By default, batch is the 0-th dimension, but chatglm uses the 1-st dimension as batch
+    # TODO: Deduce from a model via ordinal reshape (?)
and topology + batch_dim = 1 if config.model_type == "chatglm" else 0 + + fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim) + + normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + num_attention_heads = normalized_config.num_attention_heads if config.model_type == "bloom" else 1 + make_stateful( + ov_model, not_kv_inputs, key_value_input_names, key_value_output_names, batch_dim, num_attention_heads, None + ) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 8cea5eb7b6..6d0af462cc 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -554,7 +554,7 @@ def from_pretrained( model = TimmForImageClassification.from_pretrained(model_id, **kwargs) onnx_config = TimmOnnxConfig(model.config) - return cls._to_load(model=model, config=config, onnx_config=onnx_config) + return cls._to_load(model=model, config=config, onnx_config=onnx_config, stateful=False) else: return super().from_pretrained( model_id=model_id, diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 32b6b02377..05dc3af9b5 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -315,6 +315,7 @@ def _to_load( force_download: bool = False, cache_dir: Optional[str] = None, local_files_only: bool = False, + stateful: bool = False, **kwargs, ): save_dir = TemporaryDirectory() @@ -326,6 +327,7 @@ def _to_load( config=onnx_config, opset=onnx_config.DEFAULT_ONNX_OPSET, output=save_dir_path / OV_XML_FILE_NAME, + stateful=stateful, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 14f8dbcafa..8a2167eae4 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -29,7 +29,8 @@ from optimum.utils import NormalizedConfigManager -from ...exporters.openvino import main_export +from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful +from ...exporters.openvino.stateful import model_has_state from ..utils.import_utils import is_transformers_version from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel @@ -125,7 +126,10 @@ def __init__( self.is_dynamic = dynamic_shapes use_cache = kwargs.pop("use_cache", True) - self.use_cache = any("past_key_values" in key.get_any_name() for key in model.inputs) + model_has_sinks = model_has_state(self.model) + self.use_cache = any("past_key_values" in key.get_any_name() for key in model.inputs) or model_has_sinks + stateful = kwargs.pop("stateful", None) # stateful model only if it is converted with stateful=True + self.stateful = model_has_sinks self.main_input_name = "input_ids" self.num_pkv = 2 self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) @@ -133,22 +137,50 @@ def __init__( self.key_value_output_names = [key for key in self.output_names if "present" in key] self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 + self.next_beam_idx = None self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) - if enable_compilation: - self.compile() + is_stateful_supported = ensure_stateful_is_available(warn=False) - if use_cache ^ self.use_cache: + if self.use_cache and 
not self.stateful: + logger.warn( + "Provided model does not contain state. It may lead to sub-optimal performance." + "Please reexport model with updated OpenVINO version >= 2023.3.0 calling the `from_pretrained` method with original model " + "and `export=True` parameter" + ) + + if self.stateful: + if stateful is None: + stateful = is_stateful_supported + if model_has_sinks and not is_stateful_supported: + raise ValueError( + "Loaded stateful model, while OpenVINO runtime version does not support stateful model inference. " + "Please update OpenVINO version >= 2023.3.0 " + "or export the original model once again with `stateful=False` when calling the `from_pretrained` method." + "To export your model, simply set `export=True`." + ) + + def raise_error(model_prop, user_prop, name): raise ValueError( - f"`use_cache` was set to `{use_cache}` but the loaded model only supports `use_cache={self.use_cache}`. " - f"Please load your current model with `use_cache={self.use_cache}` or export the original model " - f"once again with `use_cache={use_cache}` when calling the `from_pretrained` method. " + f"`{name}` was set to `{user_prop}` but the loaded model only supports `{name}={model_prop}`. " + f"Please load your current model with `{name}={model_prop}` or export the original model " + f"once again with `{name}={user_prop}` when calling the `from_pretrained` method. " "To export your model, simply set `export=True`." ) + if stateful is not None and stateful ^ self.stateful: + # We cannot transform stateful model to stateless + raise_error(self.stateful, stateful, "stateful") + + if use_cache ^ self.use_cache: + raise_error(self.use_cache, use_cache, "use_cache") + + if enable_compilation: + self.compile() + def update_pkv_precision(self, force_fp32=False): - if not self.use_cache: + if not self.use_cache or self.stateful: return pkv_precision = Type.f32 @@ -231,6 +263,7 @@ def _from_transformers( compression_option = None if load_in_8bit is not None: compression_option = "int8" if load_in_8bit else "fp32" + stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache) main_export( model_name_or_path=model_id, output=save_dir_path, @@ -243,13 +276,14 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, compression_option=compression_option, + stateful=stateful, ) config.is_decoder = True config.is_encoder_decoder = False config.save_pretrained(save_dir_path) return cls._from_pretrained( - model_id=save_dir_path, config=config, use_cache=use_cache, load_in_8bit=False, **kwargs + model_id=save_dir_path, config=config, use_cache=use_cache, load_in_8bit=False, stateful=None, **kwargs ) def _reshape( @@ -276,6 +310,8 @@ def _reshape( shapes[inputs][1] = -1 else: shapes[inputs][2] = -1 + elif input_name.startswith("beam_idx"): + shapes[inputs][0] = -1 else: shapes[inputs][1] = -1 model.reshape(shapes) @@ -290,6 +326,10 @@ def compile(self): super().compile() self.request = self.request.create_infer_request() + def _make_stateful(self): + patch_stateful(self.config, self.model) + self.stateful = True + @add_start_docstrings( """ @@ -319,49 +359,64 @@ def forward( **kwargs, ) -> CausalLMOutputWithPast: self.compile() - inputs = {} - if self.use_cache and past_key_values is not None: input_ids = input_ids[:, -1:] + batch_size = input_ids.shape[0] + if self.config.model_type == "bloom": + batch_size *= self.normalized_config.num_attention_heads + inputs = {} past_len = 0 - if past_key_values is not None: - if self.config.model_type 
not in MULTI_QUERY_ATTN_MODELS: - past_len = past_key_values[0][1].shape[-2] - if self._pkv_precision == Type.bf16: - # numpy does not support bf16, pretending f16, should change to bf16 - past_key_values = tuple( - Tensor(past_key_value, past_key_value.shape, Type.bf16) - for pkv_per_layer in past_key_values - for past_key_value in pkv_per_layer - ) - else: - # Flatten the past_key_values - past_key_values = tuple( - past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer - ) - else: - past_len = past_key_values[0].shape[-2] - - # Add the past_key_values to the decoder inputs - inputs = dict(zip(self.key_value_input_names, past_key_values)) - - # Create empty past_key_values for decoder_with_past first generation step - elif self.use_cache: - batch_size = input_ids.shape[0] - if self.config.model_type == "bloom": - batch_size *= self.normalized_config.num_attention_heads - - for input_name in self.key_value_input_names: - model_inputs = self.model.input(input_name) - shape = model_inputs.get_partial_shape() - shape[0] = batch_size - if shape[2].is_dynamic: - shape[2] = 0 + if not self.stateful: + if past_key_values is not None: + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + past_len = past_key_values[0][1].shape[-2] + if self._pkv_precision == Type.bf16: + # numpy does not support bf16, pretending f16, should change to bf16 + past_key_values = tuple( + Tensor(past_key_value, past_key_value.shape, Type.bf16) + for pkv_per_layer in past_key_values + for past_key_value in pkv_per_layer + ) + else: + # Flatten the past_key_values + past_key_values = tuple( + past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer + ) else: - shape[1] = 0 - inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) + past_len = past_key_values[0].shape[-2] + + # Add the past_key_values to the decoder inputs + inputs = dict(zip(self.key_value_input_names, past_key_values)) + + # Create empty past_key_values for decoder_with_past first generation step + elif self.use_cache: + for input_name in self.key_value_input_names: + model_inputs = self.model.input(input_name) + shape = model_inputs.get_partial_shape() + if self.config.model_type == "chatglm": + shape[0] = 0 + shape[1] = batch_size + else: + shape[0] = batch_size + if shape[2].is_dynamic: + shape[2] = 0 + else: + shape[1] = 0 + inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) + else: + # past_key_values are not used explicitly, instead they are handled inside the model + if past_key_values is None: + # Need a marker to differentiate the first generate iteration from the others in + # the first condition at the function beginning above. + # It should be something that is not None and it should be True when converted to Boolean. 
+ past_key_values = ((),) + # This is the first iteration in a sequence, reset all states + self.request.reset_state() + # Set initial value for the next beam_idx input that will be used at the current iteration + # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used + self.next_beam_idx = np.arange(batch_size, dtype=int) inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed @@ -387,21 +442,27 @@ def forward( inputs["position_ids"] = position_ids + if "beam_idx" in self.input_names: + inputs["beam_idx"] = ( + self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) + ) + # Run inference self.request.start_async(inputs, share_inputs=True) self.request.wait() logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) - if self.use_cache: - # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) - if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: - # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) - past_key_values = tuple( - past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) - ) - else: - past_key_values = None + if not self.stateful: + if self.use_cache: + # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) + past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) + past_key_values = tuple( + past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) + ) + else: + past_key_values = None return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) @@ -428,18 +489,23 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg } # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache - @staticmethod def _reorder_cache( - past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: """ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct beam_idx at every generation step. 
""" - return tuple( - tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values - ) + if self.stateful: + # TODO: Apply it differently based on model type + # TODO: At least for bloom we need to replicate values for each attention head + self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration + return past_key_values + else: + return tuple( + tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values + ) def can_generate(self): """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" @@ -500,7 +566,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg use_cache = kwargs.get("use_cache", None) # only last token for input_ids if past is not None - if past_key_values: + if past_key_values and not self.stateful: # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) @@ -522,15 +588,23 @@ def _reorder_cache( [`~PreTrainedModel.beam_sample`] is called for bloom architecture. This is required to match `past_key_values` with the correct beam_idx at every generation step. """ - standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx)) - reordered_past = tuple( - ( - np.take(layer_past[0], beam_idx, 0), - np.take(layer_past[1], beam_idx, 0), + if self.stateful: + beam_idx = np.array(beam_idx) + batch_size = beam_idx.shape[0] + indices = np.array(range(batch_size * self.normalized_config.num_attention_heads)) + indices = indices.reshape([batch_size, self.normalized_config.num_attention_heads]) + self.next_beam_idx = np.take(indices, beam_idx, 0).flatten() + return past_key_values + else: + standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx)) + reordered_past = tuple( + ( + np.take(layer_past[0], beam_idx, 0), + np.take(layer_past[1], beam_idx, 0), + ) + for layer_past in standardized_past ) - for layer_past in standardized_past - ) - return self._convert_to_bloom_cache(reordered_past) + return self._convert_to_bloom_cache(reordered_past) # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._convert_to_bloom_cache @staticmethod @@ -602,8 +676,11 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg class OVGPTBigCodeForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache - @staticmethod def _reorder_cache( - past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: - return tuple(np.take(layer_past, beam_idx, 0) for layer_past in past_key_values) + if self.stateful: + self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration + return past_key_values + else: + return tuple(np.take(layer_past, beam_idx, 0) for layer_past in past_key_values) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index afa5ff81dd..63fac8df6d 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -38,6 +38,7 @@ from optimum.quantization_base import OptimumQuantizer from 
...exporters.openvino import export, export_pytorch_via_onnx +from ...exporters.openvino.stateful import ensure_export_task_support_stateful from ..utils.constant import _TASK_ALIASES from .configuration import OVConfig from .modeling_base import OVBaseModel @@ -313,9 +314,11 @@ def start_async( inputs: Any = None, userdata: Any = None, share_inputs: bool = False, + *, + shared_memory: Any = None, ): data_cache.append(inputs) - self.request.infer(inputs, share_inputs) + self.request.infer(inputs, share_inputs, share_outputs=True, shared_memory=shared_memory) def wait(self): pass @@ -415,6 +418,8 @@ def _quantize_torchmodel( onnx_config = onnx_config_class( model.config, use_past=model.config.use_cache, use_past_in_inputs=model.config.use_cache ) + if model.config.use_cache: + task = "text-generation-with-past" else: onnx_config = onnx_config_class(model.config) @@ -423,7 +428,10 @@ def _quantize_torchmodel( export_fn = export if not quantization_config.save_onnx_model else export_pytorch_via_onnx opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET) opset = max(opset, MIN_ONNX_QDQ_OPSET) - _, _, is_onnx = export_fn(model=model, config=onnx_config, output=model_path, opset=opset) + kwargs = {} + if not quantization_config.save_onnx_model: + kwargs = {"stateful": ensure_export_task_support_stateful(task)} + _, _, is_onnx = export_fn(model=model, config=onnx_config, output=model_path, opset=opset, **kwargs) if is_onnx: # Load and save the compressed model model = core.read_model(onnx_path) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index f778bbfcbd..3f3fa6c55b 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -70,12 +70,17 @@ _openvino_version = "N/A" if _openvino_available: try: - _openvino_version = importlib_metadata.version("openvino") - except importlib_metadata.PackageNotFoundError: - try: - _openvino_version = importlib_metadata.version("openvino-nightly") - except importlib_metadata.PackageNotFoundError: - _openvino_available = False + from openvino.runtime import get_version + + version = get_version() + # avoid invalid format + if "-" in version: + major_version, dev_info = version.split("-", 1) + commit_id = dev_info.split("-")[0] + version = f"{major_version}-{commit_id}" + _openvino_version = version + except ImportError: + _openvino_available = False _nncf_available = importlib.util.find_spec("nncf") is not None diff --git a/setup.py b/setup.py index 5c83182476..d85e02dfe1 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ "onnxruntime<1.15.0", "transformers>=4.34.0", ], - "openvino": ["openvino>=2023.2", "onnx", "onnxruntime", "transformers>=4.34.0"], + "openvino": ["openvino>=2023.2", "onnx", "onnxruntime", "transformers>=4.36.0", "optimum>=1.16.1"], "nncf": ["nncf>=2.7.0"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers"], diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index dc33b39f2a..334329cdd2 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -73,6 +73,7 @@ from optimum.intel.openvino import OV_DECODER_NAME, OV_DECODER_WITH_PAST_NAME, OV_ENCODER_NAME, OV_XML_FILE_NAME from optimum.intel.openvino.modeling_seq2seq import OVDecoder, OVEncoder from optimum.intel.openvino.modeling_timm import TimmImageProcessor +from optimum.intel.utils.import_utils import is_openvino_version from optimum.utils import ( DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, 
DIFFUSION_MODEL_UNET_SUBFOLDER, @@ -487,6 +488,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "pegasus", ) GENERATION_LENGTH = 100 + IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -494,6 +496,8 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True) self.assertIsInstance(ov_model.config, PretrainedConfig) + self.assertTrue(ov_model.use_cache) + self.assertEqual(ov_model.stateful, self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode") transformers_model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer( @@ -507,6 +511,10 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in ov_outputs) + self.assertIsInstance(ov_outputs.past_key_values, tuple) + if self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode": + self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs @@ -562,8 +570,7 @@ def test_compare_with_and_without_past_key_values(self): model_id = MODEL_NAMES["gpt2"] tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer("This is a sample input", return_tensors="pt") - - model_with_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True) + model_with_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=False) outputs_model_with_pkv = model_with_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) @@ -574,6 +581,12 @@ def test_compare_with_and_without_past_key_values(self): self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) + if self.IS_SUPPORT_STATEFUL: + model_stateful = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=True) + outputs_model_stateful = model_stateful.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) + self.assertTrue(torch.equal(outputs_model_without_pkv, outputs_model_stateful)) del model_with_pkv del model_without_pkv @@ -600,7 +613,7 @@ def test_default_filling_attention_mask(self): attention_mask = tokens.pop("attention_mask") outs_without_attn_mask = model_with_cache(**tokens) self.assertTrue(torch.allclose(outs.logits, outs_without_attn_mask.logits)) - input_ids = torch.argmax(outs.logits, dim=2) + input_ids = torch.argmax(outs.logits[:, -1:, :], dim=2) past_key_values = outs.past_key_values attention_mask = torch.ones((input_ids.shape[0], tokens.input_ids.shape[1] + 1), dtype=torch.long) outs_step2 = model_with_cache( diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index a08da51aab..d6da6a78ba 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -51,6 +51,7 @@ from optimum.intel.openvino.configuration import INT8_WEIGHT_COMPRESSION_CONFIG +from optimum.intel.utils.import_utils import is_openvino_version from utils_tests import MODEL_NAMES, 
get_num_quantized_nodes, _ARCHITECTURES_TO_EXPECTED_INT8 _TASK_TO_DATASET = { @@ -166,6 +167,8 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionXLPipeline, "stable-diffusion-xl"), ) + IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): task = model_cls.export_feature @@ -218,7 +221,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i with tempfile.TemporaryDirectory() as tmp_dir: model_id = MODEL_NAMES[model_name] - transformers_model = model_cls.from_pretrained(model_id, export=True) + transformers_model = model_cls.from_pretrained(model_id, export=True, stateful=False) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -239,9 +242,43 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i outputs = model(**tokens) self.assertTrue("logits" in outputs) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS) + @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above") + def test_ovmodel_4bit_weight_compression_stateful(self, model_cls, model_name, expected_int8, expected_int4): + task = model_cls.export_feature + + with tempfile.TemporaryDirectory() as tmp_dir: + model_id = MODEL_NAMES[model_name] + transformers_model = model_cls.from_pretrained(model_id, export=True, stateful=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) + quantizer.quantize( + save_directory=tmp_dir, + weights_only=True, + quantization_config=OVConfig(compression={"type": "int4_sym_g128", "ratio": 0.8}), + ) + model = model_cls.from_pretrained(tmp_dir) + self.assertTrue(model.stateful) + self.assertTrue(model.use_cache) + + _, num_int8, num_int4 = get_num_quantized_nodes(model) + self.assertEqual(expected_int8, num_int8) + self.assertEqual(expected_int4, num_int4) + + tokens = tokenizer("This is a sample input", return_tensors="pt") + outputs = model(**tokens) + + self.assertTrue("logits" in outputs) + self.assertTrue("past_key_values" in outputs) + self.assertIsInstance(outputs.past_key_values, tuple) + self.assertTrue(len(outputs.past_key_values) == 1 and len(outputs.past_key_values[0]) == 0) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): - model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True) + model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False) if model.export_feature.startswith("text2text-generation"): models = [model.encoder, model.decoder, model.decoder_with_past] @@ -256,6 +293,17 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): _, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_ov_int8[i], num_int8) + @parameterized.expand((OVModelForCausalLM, "gpt2")) + @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above") + def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_type): + model = 
model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=True) + self.assertTrue(model.stateful) + self.assertTrue(model.use_cache) + + expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type][0] + _, num_int8, _ = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int8, num_int8) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=False)
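For reference, a minimal usage sketch of the behavior introduced by this change (not part of the diff; the model id, prompt, and generation settings are illustrative, and stateful export requires an OpenVINO runtime >= 2023.3). On the CLI, stateful export becomes the default for `optimum-cli export openvino`, and the new `--disable-stateful` flag restores the stateless behavior.

# Hedged usage sketch; "gpt2" and the prompt below are placeholders.
from transformers import AutoTokenizer

from optimum.intel import OVModelForCausalLM

model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# With a supported OpenVINO runtime the exported decoder is stateful by default:
# the kv-cache stays inside the model and a beam_idx input is added.
# Passing stateful=False keeps the previous behavior with explicit past_key_values inputs/outputs.
model = OVModelForCausalLM.from_pretrained(model_id, export=True)

tokens = tokenizer("This is a sample input", return_tensors="pt")
generated = model.generate(**tokens, max_new_tokens=20)
print(tokenizer.decode(generated[0], skip_special_tokens=True))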