huggingface · slyalin · Dec 5, 2023 · Dec 7, 2023 · Dec 7, 2023 · Dec 8, 2023
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
@@ -70,6 +70,11 @@ def parse_args_openvino(parser: "ArgumentParser"):
     )
     optional_group.add_argument("--fp16", action="store_true", help="Compress weights to fp16"),
     optional_group.add_argument("--int8", action="store_true", help="Compress weights to int8"),
+    optional_group.add_argument(
+        "--stateful",
+        action="store_true",
+        help="Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs"
+    ),
 
 
 class OVExportCommand(BaseOptimumCLICommand):
@@ -106,5 +111,6 @@ def run(self):
             pad_token_id=self.args.pad_token_id,
             fp16=self.args.fp16,
             int8=self.args.int8,
+            stateful=self.args.stateful,
             # **input_shapes,
         )
diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py
@@ -1,5 +1,6 @@
 from .__main__ import main_export
 from .convert import export, export_models, export_pytorch_via_onnx
+from .stateful import patch_stateful, raise_if_openvino_is_too_old
 
 
 __all__ = ["main_export", "export", "export_models"]
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
@@ -56,6 +56,7 @@ def main_export(
     custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None,
     fn_get_submodels: Optional[Callable] = None,
     int8: Optional[bool] = None,
+    stateful: Optional[bool] = None,
     **kwargs_shapes,
 ):
     """
@@ -350,6 +351,7 @@ class StoreAttr(object):
         device=device,
         fp16=fp16,
         int8=int8,
+        stateful=stateful,
         model_kwargs=model_kwargs,
     )
 

diff --git a/optimum/exporters/openvino/better_transformer_patch.py b/optimum/exporters/openvino/better_transformer_patch.py
@@ -0,0 +1,41 @@
+#  Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+import torch
+import types
+
+
+def patch_model_with_bettertransformer(model, model_config):
+    """Convert ``model`` with BetterTransformer and adapt the dummy inputs used for tracing.
+
+    Calls ``model.to_bettertransformer()``. On success, ``model_config.generate_dummy_inputs``
+    is wrapped so that an ``input_ids`` tensor whose second dimension is 1 is concatenated
+    with itself along the last dimension, and ``attention_mask`` (when present) gets one
+    extra column of ones. The unwrapped method stays reachable as
+    ``model_config._original_generate_dummy_inputs``.
+
+    Args:
+        model: Object exposing ``to_bettertransformer()``.
+        model_config: Export config exposing ``generate_dummy_inputs``; patched in place.
+
+    Returns:
+        The converted model, or the original ``model`` unchanged when the conversion
+        raises (a warning is printed in that case and the config is left untouched).
+    """
+    try:
+        model = model.to_bettertransformer()
+    except Exception as e:
+        # Deliberate best effort: an unsupported architecture must not abort the export.
+        print(f'[ WARNING ] Cannot apply model.to_bettertransformer because of the exception:\n{e}')
+        return model
+
+    # The wrapper below resolves `_original_generate_dummy_inputs` on the config at call time.
+    # Patching the same config twice would make that attribute point at the wrapper itself
+    # and recurse forever, so an already patched config is left as it is.
+    if getattr(model_config, '_original_generate_dummy_inputs', None) is not None:
+        return model
+
+    # for better transformers we need sequence length to be not 1 to make a correct trace
+    # patch generate_dummy_inputs in the config
+
+    def patched_generate_dummy_inputs(self, *args, **kwargs):
+        dummy_inputs = self._original_generate_dummy_inputs(*args, **kwargs)
+        if 'input_ids' in dummy_inputs and dummy_inputs['input_ids'].shape[1] == 1:
+            input_ids = dummy_inputs['input_ids']
+            dummy_inputs['input_ids'] = torch.cat([input_ids, input_ids], dim=-1)
+            # Not every set of dummy inputs is guaranteed to carry an attention mask.
+            if 'attention_mask' in dummy_inputs:
+                attention_mask = dummy_inputs['attention_mask']
+                extra_column = attention_mask.new_ones((attention_mask.shape[0], 1))
+                dummy_inputs['attention_mask'] = torch.cat([attention_mask, extra_column], dim=-1)
+        return dummy_inputs
+
+    model_config._original_generate_dummy_inputs = model_config.generate_dummy_inputs
+    model_config.generate_dummy_inputs = types.MethodType(patched_generate_dummy_inputs, model_config)
+
+    return model
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
@@ -30,6 +30,8 @@
 from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx
 from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
 from optimum.utils import is_diffusers_available
+from .stateful import patch_stateful, raise_if_openvino_is_too_old
+from .better_transformer_patch import patch_model_with_bettertransformer
 
 from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
 from .utils import (
@@ -77,6 +79,7 @@ def export(
     model_kwargs: Optional[Dict[str, Any]] = None,
     fp16: bool = False,
     int8: bool = False,
+    stateful: bool = False,
 ) -> Tuple[List[str], List[str]]:
     """
     Exports a Pytorch or TensorFlow model to an OpenVINO Intermediate Representation.
@@ -120,6 +123,7 @@ def export(
             model_kwargs=model_kwargs,
             fp16=fp16,
             int8=int8,
+            stateful=stateful,
         )
 
     elif is_tf_available() and issubclass(type(model), TFPreTrainedModel):
@@ -232,6 +236,7 @@ def export_pytorch(
     model_kwargs: Optional[Dict[str, Any]] = None,
     fp16: bool = False,
     int8: bool = False,
+    stateful: bool = False,
 ) -> Tuple[List[str], List[str]]:
     """
     Exports a PyTorch model to an OpenVINO Intermediate Representation.
@@ -263,6 +268,15 @@ def export_pytorch(
     logger.info(f"Using framework PyTorch: {torch.__version__}")
     output = Path(output)
 
+    if stateful:
+        # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect
+        # both of them are applied to demonstrate the best performance.
+        # TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation.
+        model = patch_model_with_bettertransformer(model, config)
+        # TODO: Consider unpatching model after export is done in the end of this function.
+        #       Now it is left as-is because the model is not expected to be used after call export_pytorch, and
+        #       this function is one of the _internal_ steps in a bigger model conversion pipeline.
+
     with torch.no_grad():
         model.config.torchscript = False
         model.config.return_dict = True
@@ -341,6 +355,10 @@ def ts_patched_forward(*args, **kwargs):
             logger.warning(f"Export model to OpenVINO directly failed with: \n{ex}.\nModel will be exported to ONNX")
             if patch_model_forward:
                 model.forward = orig_forward
+            if stateful:
+                raise ValueError(
+                    'Making stateful models is not supported when exporting to ONNX as an intermediate step. '
+                    'Set stateful=False, or provide a model that can be converted to OpenVINO without fallback to ONNX conversion path.')
             return export_pytorch_via_onnx(
                 model, config, opset, output, device, input_shapes, model_kwargs, fp16=fp16, int8=int8
             )
@@ -364,6 +382,13 @@ def ts_patched_forward(*args, **kwargs):
             inp_tensor.get_node().set_partial_shape(static_shape)
             inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype))
         ov_model.validate_nodes_and_infer_types()
+
+        if stateful:
+            # Patching model according to stateful parameters
+            model.key_value_input_names = [name for name in input_names if name.startswith('past_key_values.')]
+            model.key_value_output_names = [name for name in output_names if name.startswith('present.')]
+            patch_stateful(model, ov_model)
+
         _save_model(ov_model, output, compress_to_fp16=fp16, load_in_8bit=int8)
         clear_class_registry()
         del model
@@ -383,6 +408,7 @@ def export_models(
     model_kwargs: Optional[Dict[str, Any]] = None,
     fp16: bool = False,
     int8: bool = False,
+    stateful: bool = False,
 ) -> Tuple[List[List[str]], List[List[str]]]:
     """
     Export the models to OpenVINO IR format
@@ -406,6 +432,9 @@ def export_models(
     Returns:
         list of input_names and output_names from ONNX configuration
     """
+    if stateful:
+        # This is checked again after the model conversion, but checking it up front saves the user time when an unsuitable OpenVINO version is installed
+        raise_if_openvino_is_too_old()
     outputs = []
 
     if output_names is not None and len(output_names) != len(models_and_onnx_configs):
@@ -429,6 +458,7 @@ def export_models(
                 model_kwargs=model_kwargs,
                 fp16=fp16,
                 int8=int8,
+                stateful=stateful,
             )
         )
 

diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py
@@ -0,0 +1,146 @@
+#  Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+import numpy as np
+from packaging import version
+import openvino as ov
+from openvino.runtime import opset13
+from optimum.intel.utils.import_utils import is_openvino_version
+
+
+def model_has_input_output_name(ov_model: ov.Model, name: str):
+    """Return True if ``name`` is one of the tensor names of any model input or output.
+
+    Args:
+        ov_model: Model whose ``inputs`` and ``outputs`` are inspected.
+        name: Tensor name to look for.
+    """
+    return any(name in port.get_names() for port in ov_model.inputs + ov_model.outputs)
+
+
+# Name this helper had before it was renamed; kept so callers still using it keep working.
+model_has_name = model_has_input_output_name
+
+
+def model_has_input(ov_model: ov.Model, name: str):
+    """Return True if ``name`` is one of the tensor names of any model input."""
+    return any(name in port.get_names() for port in ov_model.inputs)
+
+
+def model_has_cache_reorder(ov_model):
+    """Return True if the model has an input named ``beam_idx``."""
+    beam_idx_name = 'beam_idx'
+    return model_has_input(ov_model, beam_idx_name)
+
+
+def _model_has_state(ov_model):
+    """Return True if ``ov_model.get_sinks()`` reports at least one sink."""
+    # TODO: Provide a better way based on the variables availability, but OV Python API doesn't expose required methods
+    return len(ov_model.get_sinks()) > 0
+
+
+# Name this helper had before it was renamed; kept so callers still using it keep working.
+model_has_state = _model_has_state
+
+
+def fuse_cache_reorder(ov_model: ov.Model, not_kv_inputs, key_value_input_names, gather_dim: int):
+    """Add a new beam_idx parameter and a Gather op per each kv-cache input in a given model.
+
+    Should be run before make_stateful. Implements optimum's _reorder_cache
+    inside the model in the beginning of each iteration.
+    Gather works along given gather_dim dimension that may vary from model to model.
+    KV-cache inputs are identified based on names in key_value_input_names.
+    The new beam_idx input is appended to not_kv_inputs.
+
+    Args:
+        ov_model: Model to modify in place.
+        not_kv_inputs: List of non-kv-cache model inputs; extended with the beam_idx input.
+        key_value_input_names: Names of the kv-cache inputs to be reordered.
+        gather_dim: Dimension of the kv-cache tensors the Gather op works along.
+
+    Raises:
+        ValueError: If the model already has an input or output named ``beam_idx``.
+    """
+
+    # An explicit raise instead of an assert: the check must survive `python -O`.
+    if model_has_input_output_name(ov_model, 'beam_idx'):
+        raise ValueError('Model already has fused cache')
+    input_batch = ov_model.input('input_ids').get_partial_shape()[0]
+    beam_idx = opset13.parameter(name='beam_idx', dtype=ov.Type.i32, shape=ov.PartialShape([input_batch]))
+    beam_idx.output(0).get_tensor().add_names({'beam_idx'})  # why list is not accepted?
+    ov_model.add_parameters([beam_idx])
+    # Presumably the parameter added just above is the last model input -- TODO confirm
+    not_kv_inputs.append(ov_model.inputs[-1])
+    # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
+    for input_name in key_value_input_names:
+        parameter_output_port = ov_model.input(input_name)
+        # Collect the consumers before creating the Gather, which reads from the same port
+        consumers = parameter_output_port.get_target_inputs()
+        gather = opset13.gather(parameter_output_port, beam_idx, opset13.constant(gather_dim))
+        for consumer in consumers:
+            consumer.replace_source_output(gather.output(0))
+    ov_model.validate_nodes_and_infer_types()
+
+
+def build_state_initializer(ov_model: ov.Model, batch_dim):
+    """Build the initialization ShapeOf expression for all ReadValue ops.
+
+    Every op whose type name is ``ReadValue`` gets a single argument: a constant
+    0.0 of the op's output element type broadcast to a shape made of the
+    ``min_length`` of each output dimension, with the ``batch_dim`` entry replaced
+    by the first dimension of the ``input_ids`` shape taken from the graph.
+
+    Args:
+        ov_model: Model to modify in place; must have an ``input_ids`` input.
+        batch_dim: Index of the batch dimension in the ReadValue output shapes.
+    """
+    ids_shape = opset13.shape_of(ov_model.input('input_ids'), output_type='i64')
+    batch = opset13.gather(ids_shape, opset13.constant([0]), opset13.constant(0))
+
+    for op in ov_model.get_ops():
+        if op.get_type_name() != 'ReadValue':
+            continue
+
+        element_type = op.get_output_element_type(0)
+        lengths = [dim.min_length for dim in op.get_output_partial_shape(0)]
+        lengths[batch_dim] = batch
+
+        # Plain integers become 1-element i64 constants; the batch node is used as is.
+        shape_parts = []
+        for length in lengths:
+            if isinstance(length, int):
+                length = opset13.constant(np.array([length], dtype=np.int64))
+            shape_parts.append(length)
+
+        target_shape = opset13.concat(shape_parts, axis=0)
+        zero = opset13.constant(0.0, dtype=element_type)
+        op.set_arguments([opset13.broadcast(zero, target_shape)])
+
+    ov_model.validate_nodes_and_infer_types()
+
+
+def make_stateful(
+        ov_model: ov.Model,
+        not_kv_inputs,
+        key_value_input_names,
+        key_value_output_names,
+        batch_dim,
+        num_attention_heads,
+        num_beams_and_batch=None):
+    """Hide kv-cache inputs and outputs inside the model as variables.
+
+    Args:
+        ov_model: Model to modify in place.
+        not_kv_inputs: Model inputs that are not kv-cache inputs.
+        key_value_input_names: Names of kv-cache inputs, paired by position with
+            key_value_output_names.
+        key_value_output_names: Names of kv-cache outputs.
+        batch_dim: Index of the batch dimension in the kv-cache tensors.
+        num_attention_heads: Multiplier applied to num_beams_and_batch when the batch
+            dimension of a kv-cache input is fixed.
+        num_beams_and_batch: When given, the batch dimension of the inputs is set to
+            this value; when None, shapes stay untouched and the state initializer
+            is built after the transformation instead.
+    """
+    from openvino._offline_transformations import apply_make_stateful_transformation
+
+    # TODO: Can we derive the dimensions from the model topology?
+    fixed_batch = num_beams_and_batch is not None
+    input_output_map = dict(zip(key_value_input_names, key_value_output_names))
+
+    if fixed_batch:
+        # Set batch size for input_ids and attention mask to avoid dynamic dimension got propagated from the end of the model back to ReadValue
+        for port in not_kv_inputs:
+            port_shape = port.get_partial_shape()
+            if port_shape.rank.get_length() > 2:
+                print(f'[ WARNING ] Rank of {port.get_any_name()} input of the model is not 2, batch size is not set')
+                continue
+            # rank == 1 is the beam_index case
+            port_shape[0] = num_beams_and_batch
+            port.get_node().set_partial_shape(port_shape)
+
+        for kv_input_name in input_output_map:
+            kv_port = ov_model.input(kv_input_name)
+            kv_shape = kv_port.get_partial_shape()
+            kv_shape[batch_dim] = num_beams_and_batch * num_attention_heads
+            kv_port.get_node().set_partial_shape(kv_shape)
+
+        # Re-validation model if shapes are altered above
+        ov_model.validate_nodes_and_infer_types()
+
+    apply_make_stateful_transformation(ov_model, input_output_map)
+    if not fixed_batch:
+        build_state_initializer(ov_model, batch_dim)
+
+
+def raise_if_openvino_is_too_old():
+    """Raise ValueError when ``is_openvino_version("<=", "2023.2")`` holds.
+
+    Raises:
+        ValueError: The installed openvino is reported as too old for stateful models.
+    """
+    too_old = is_openvino_version("<=", "2023.2")
+    if not too_old:
+        return
+    raise ValueError(f'Could not create or use stateful model when using old version of openvino=={ov.__version__}. Install openvino>=2023.3.0.')
+
+
+def patch_stateful(model, ov_model):
+    """Fuse the cache reorder into ``ov_model`` and turn it into a stateful model.
+
+    Args:
+        model: Source model; its ``key_value_input_names``, ``key_value_output_names``,
+            ``config.model_type`` and (for bloom) ``normalized_config`` are read.
+        ov_model: OpenVINO model to modify in place.
+
+    Raises:
+        ValueError: From raise_if_openvino_is_too_old when openvino is too old.
+    """
+    raise_if_openvino_is_too_old()
+
+    kv_input_names = model.key_value_input_names
+    not_kv_inputs = [
+        port
+        for port in ov_model.inputs
+        if not any(tensor_name in kv_input_names for tensor_name in port.get_names())
+    ]
+
+    model_type = model.config.model_type
+
+    # By default, batch is the 0-th but chatglm uses 1-st dimension as batch
+    # TODO: Deduce from a model via ordinal reshape (?) and topology
+    if model_type == 'chatglm':
+        batch_dim = 1
+    else:
+        batch_dim = 0
+
+    fuse_cache_reorder(ov_model, not_kv_inputs, kv_input_names, batch_dim)
+
+    # Only bloom takes the head count from its normalized config; other models use 1.
+    if model_type == 'bloom':
+        num_attention_heads = model.normalized_config.num_attention_heads
+    else:
+        num_attention_heads = 1
+
+    make_stateful(
+        ov_model,
+        not_kv_inputs,
+        kv_input_names,
+        model.key_value_output_names,
+        batch_dim,
+        num_attention_heads,
+        num_beams_and_batch=None)