huggingface · echarlaix · Jan 16, 2024 · Dec 5, 2023 · Dec 7, 2023 · Dec 7, 2023
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
@@ -92,6 +92,11 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "precision (by default 20% in INT8). This helps to achieve better accuracy after weight quantization."
         ),
     )
+    optional_group.add_argument(
+        "--stateful",
+        action="store_true",
+        help="Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs",
+    )
 
 
 class OVExportCommand(BaseOptimumCLICommand):
@@ -138,6 +143,7 @@ def run(self):
             trust_remote_code=self.args.trust_remote_code,
             pad_token_id=self.args.pad_token_id,
             compression_option=self.args.weight_format,
-            compression_ratio=self.args.ratio
+            compression_ratio=self.args.ratio,
+            stateful=self.args.stateful,
             # **input_shapes,
         )
diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py
@@ -1,5 +1,6 @@
 from .__main__ import main_export
 from .convert import export, export_models, export_pytorch_via_onnx
+from .stateful import patch_stateful, raise_if_openvino_is_too_old
 
 
 __all__ = ["main_export", "export", "export_models"]
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
@@ -65,6 +65,7 @@ def main_export(
     fn_get_submodels: Optional[Callable] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[float] = None,
+    stateful: Optional[bool] = None,
     **kwargs_shapes,
 ):
     """
@@ -124,6 +125,7 @@ def main_export(
             `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point, `f32` - means no compression.
         compression_ratio (`Optional[float]`, defaults to `None`):
             Compression ratio between primary and backup precision (only relevant to INT4).
+        stateful (`Optional[bool]`)  - Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
         **kwargs_shapes (`Dict`):
             Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export.
 
@@ -373,6 +375,7 @@ class StoreAttr(object):
         device=device,
         compression_option=compression_option,
         compression_ratio=compression_ratio,
+        stateful=stateful,
         model_kwargs=model_kwargs,
     )
 

diff --git a/optimum/exporters/openvino/better_transformer_patch.py b/optimum/exporters/openvino/better_transformer_patch.py
@@ -0,0 +1,39 @@
+#  Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import logging as log
+
+from optimum.intel.utils.import_utils import is_torch_version
+
+
+def patch_model_with_bettertransformer(model):
+    if is_torch_version("<", "2.0"):
+        log.warn(
+            "integration Scaled Dot Product Attention optimization supported only with torch > 2.0."
+            "Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
+            "It is recommended to upgrade PyTorch version for using stateful model or use stateful=Flase"
+        )
+    # model already has required SDPA implementation
+    if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa":
+        return model
+    try:
+        model = model.to_bettertransformer()
+    except Exception as e:
+        log.warn(
+            f"Cannot apply model.to_bettertransformer because of the exception:\n{e}."
+            " Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
+        )
+        return model
+
+    return model
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
@@ -32,6 +32,8 @@
 from optimum.utils import is_diffusers_available
 
 from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
+from .better_transformer_patch import patch_model_with_bettertransformer
+from .stateful import patch_stateful, raise_if_openvino_is_too_old
 from .utils import (
     OV_XML_FILE_NAME,
     clear_class_registry,
@@ -102,6 +104,7 @@ def export(
     model_kwargs: Optional[Dict[str, Any]] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[float] = None,
+    stateful: bool = False,
 ) -> Tuple[List[str], List[str]]:
     """
     Exports a Pytorch or TensorFlow model to an OpenVINO Intermediate Representation.
@@ -125,6 +128,8 @@ def export(
             Compression ratio between primary and backup precision (only relevant to INT4).
         input_shapes (`Optional[Dict]`, defaults to `None`):
             If specified, allows to use specific shapes for the example input provided to the exporter.
+         stateful (`Optional[bool]`):
+            Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
 
     Returns:
         `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from
@@ -150,6 +155,7 @@ def export(
             compression_option=compression_option,
             compression_ratio=compression_ratio,
             model_kwargs=model_kwargs,
+            stateful=stateful,
         )
 
     elif is_tf_available() and issubclass(type(model), TFPreTrainedModel):
@@ -234,6 +240,8 @@ def export_pytorch_via_onnx(
             `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point.
         compression_ratio (`Optional[float]`, defaults to `None`):
             Compression ratio between primary and backup precision (only relevant to INT4).
+        stateful (`Optional[bool]`):
+            Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
 
     Returns:
         `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, and the named inputs from
@@ -271,6 +279,7 @@ def export_pytorch(
     model_kwargs: Optional[Dict[str, Any]] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[float] = None,
+    stateful: bool = False,
 ) -> Tuple[List[str], List[str]]:
     """
     Exports a PyTorch model to an OpenVINO Intermediate Representation.
@@ -291,6 +300,13 @@ def export_pytorch(
             If specified, allows to use specific shapes for the example input provided to the exporter.
         model_kwargs (optional[Dict[str, Any]], defaults to `None`):
             Additional kwargs for model export
+        compression_option (`Optional[str]`, defaults to `None`):
+            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
+            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point.
+        compression_ratio (`Optional[float]`, defaults to `None`):
+            Compression ratio between primary and backup precision (only relevant to INT4).
+        stateful (`Optional[bool]`):
+            Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
 
     Returns:
         `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, and the named inputs from
@@ -302,6 +318,15 @@ def export_pytorch(
     logger.info(f"Using framework PyTorch: {torch.__version__}")
     output = Path(output)
 
+    if stateful:
+        # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect
+        # both of them are applied to demonstrate the best performance.
+        # TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation.
+        model = patch_model_with_bettertransformer(model)
+        # TODO: Consider unpatching model after export is done in the end of this function.
+        #       Now it is left as-is because the model is not expected to be used after call export_pytorch, and
+        #       this function is one of the _internal_ steps in a bigger model conversion pipeline.
+
     with torch.no_grad():
         model.config.torchscript = False
         model.config.return_dict = True
@@ -380,6 +405,11 @@ def ts_patched_forward(*args, **kwargs):
             logger.warning(f"Export model to OpenVINO directly failed with: \n{ex}.\nModel will be exported to ONNX")
             if patch_model_forward:
                 model.forward = orig_forward
+            if stateful:
+                raise ValueError(
+                    "Making stateful models is not supported when exporting to ONNX as an intermediate step. "
+                    "Set stateful=False, or provide a model that can be converted to OpenVINO without fallback to ONNX conversion path."
+                )
             return export_pytorch_via_onnx(
                 model,
                 config,
@@ -411,6 +441,10 @@ def ts_patched_forward(*args, **kwargs):
             inp_tensor.get_node().set_partial_shape(static_shape)
             inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype))
         ov_model.validate_nodes_and_infer_types()
+
+        if stateful:
+            patch_stateful(model.config, ov_model)
+
         _save_model(ov_model, output, compression_option=compression_option, compression_ratio=compression_ratio)
         clear_class_registry()
         del model
@@ -430,6 +464,7 @@ def export_models(
     model_kwargs: Optional[Dict[str, Any]] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[int] = None,
+    stateful: bool = False,
 ) -> Tuple[List[List[str]], List[List[str]]]:
     """
     Export the models to OpenVINO IR format
@@ -451,13 +486,18 @@ def export_models(
             Compression ratio between primary and backup precision (only relevant to INT4).
         model_kwargs (Optional[Dict[str, Any]], optional):
             Additional kwargs for model export.
+        stateful (`Optional[bool]`)
+            Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
 
     Raises:
         ValueError: if custom names set not equal of number of models
 
     Returns:
         list of input_names and output_names from ONNX configuration
     """
+    if stateful:
+        # This will be checked anyway after the model conversion, but checking it earlier will save time for a user if not suitable version is used
+        raise_if_openvino_is_too_old()
     outputs = []
 
     if output_names is not None and len(output_names) != len(models_and_onnx_configs):
@@ -481,6 +521,7 @@ def export_models(
                 model_kwargs=model_kwargs,
                 compression_option=compression_option,
                 compression_ratio=compression_ratio,
+                stateful=stateful,
             )
         )