From e57baaca2d0f566c8daa9fa027da07e58fe11436 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 16 Oct 2023 19:31:34 +0400 Subject: [PATCH 1/3] Add openvino export configs and support chatglm --- optimum/exporters/openvino/__init__.py | 4 + optimum/exporters/openvino/__main__.py | 143 +++++++++++++++--- optimum/exporters/openvino/base.py | 25 +++ .../openvino/dummy_input_generators.py | 61 ++++++++ optimum/exporters/openvino/model_configs.py | 91 +++++++++++ .../exporters/openvino/normalized_configs.py | 9 ++ optimum/intel/openvino/modeling_decoder.py | 31 +++- 7 files changed, 337 insertions(+), 27 deletions(-) create mode 100644 optimum/exporters/openvino/base.py create mode 100644 optimum/exporters/openvino/dummy_input_generators.py create mode 100644 optimum/exporters/openvino/model_configs.py create mode 100644 optimum/exporters/openvino/normalized_configs.py diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py index d87d8dda9e..f21ca7e595 100644 --- a/optimum/exporters/openvino/__init__.py +++ b/optimum/exporters/openvino/__init__.py @@ -1,5 +1,9 @@ from .__main__ import main_export +from .base import init_model_configs from .convert import export, export_models, export_pytorch_via_onnx +from .model_configs import * +init_model_configs() + __all__ = ["main_export", "export", "export_models"] diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 782aa0bc0d..52c72944e8 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -15,14 +15,20 @@ import logging import os from pathlib import Path -from typing import Any, Callable, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoTokenizer from optimum.exporters import TasksManager -from optimum.exporters.onnx import __main__ as optimum_main from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast +from optimum.exporters.onnx.utils import ( + _get_submodels_for_export_encoder_decoder, + _get_submodels_for_export_stable_diffusion, + get_encoder_decoder_models_for_export, + get_sam_models_for_export, + get_stable_diffusion_models_for_export, +) from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors @@ -31,6 +37,10 @@ from .convert import export_models +if TYPE_CHECKING: + from transformers import PreTrainedModel, TFPreTrainedModel + + OV_XML_FILE_NAME = "openvino_model.xml" _MAX_UNCOMPRESSED_SIZE = 1e9 @@ -38,6 +48,102 @@ logger = logging.getLogger(__name__) +def _get_submodels_and_export_configs( + model: Union["PreTrainedModel", "TFPreTrainedModel"], + task: str, + custom_onnx_configs: Dict, + custom_architecture: bool, + _variant: str, + int_dtype: str = "int64", + float_dtype: str = "fp32", + fn_get_submodels: Optional[Callable] = None, + preprocessors: Optional[List[Any]] = None, + no_position_ids: bool = False, +): + is_stable_diffusion = "stable-diffusion" in task + if not custom_architecture: + if is_stable_diffusion: + onnx_config = None + models_and_onnx_configs = get_stable_diffusion_models_for_export( + model, int_dtype=int_dtype, float_dtype=float_dtype + ) + else: + onnx_config_constructor = TasksManager.get_exporter_config_constructor( + model=model, exporter="openvino", task=task + ) + onnx_config_kwargs = {} + if task.startswith("text-generation") and 
no_position_ids: + onnx_config_kwargs["no_position_ids"] = no_position_ids + + onnx_config = onnx_config_constructor( + model.config, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + **onnx_config_kwargs, + ) + + onnx_config.variant = _variant + all_variants = "\n".join( + [f"\t- {name}: {description}" for name, description in onnx_config.VARIANTS.items()] + ) + logger.info(f"Using the export variant {onnx_config.variant}. Available variants are:\n{all_variants}") + + if model.config.is_encoder_decoder and task.startswith(TasksManager._ENCODER_DECODER_TASKS): + models_and_onnx_configs = get_encoder_decoder_models_for_export(model, onnx_config) + elif task.startswith("text-generation"): + model = patch_decoder_attention_mask(model) + onnx_config_constructor = TasksManager.get_exporter_config_constructor( + model=model, exporter="openvino", task=task + ) + onnx_config = onnx_config_constructor(model.config) + models_and_onnx_configs = {"model": (model, onnx_config)} + elif model.config.model_type == "sam": + models_and_onnx_configs = get_sam_models_for_export(model, onnx_config) + else: + models_and_onnx_configs = {"model": (model, onnx_config)} + + # When specifying custom ONNX configs for supported transformers architectures, we do + # not force to specify a custom ONNX config for each submodel. + for key, custom_onnx_config in custom_onnx_configs.items(): + models_and_onnx_configs[key] = (models_and_onnx_configs[key][0], custom_onnx_config) + else: + onnx_config = None + submodels_for_export = None + models_and_onnx_configs = {} + + if fn_get_submodels is not None: + submodels_for_export = fn_get_submodels(model) + else: + if is_stable_diffusion: + submodels_for_export = _get_submodels_for_export_stable_diffusion(model) + elif model.config.is_encoder_decoder and task.startswith(TasksManager._ENCODER_DECODER_TASKS): + submodels_for_export = _get_submodels_for_export_encoder_decoder( + model, use_past=task.endswith("-with-past") + ) + elif task.startswith("text-generation"): + model = patch_decoder_attention_mask(model) + models_and_onnx_configs = {"model": model} + else: + submodels_for_export = {"model": model} + + if submodels_for_export.keys() != custom_onnx_configs.keys(): + logger.error(f"ONNX custom configs for: {', '.join(custom_onnx_configs.keys())}") + logger.error(f"Submodels to export: {', '.join(submodels_for_export.keys())}") + raise ValueError( + "Trying to export a custom model, but could not find as many custom ONNX configs as the number of submodels to export. Please specifiy the fn_get_submodels argument, that should return a dictionary of submodules with as many items as the provided custom_onnx_configs dictionary." + ) + + for key, custom_onnx_config in custom_onnx_configs.items(): + models_and_onnx_configs[key] = (submodels_for_export[key], custom_onnx_config) + + # Default to the first ONNX config for stable-diffusion and custom architecture case. + if onnx_config is None: + onnx_config = next(iter(models_and_onnx_configs.values()))[1] + + return onnx_config, models_and_onnx_configs + + def main_export( model_name_or_path: str, output: Union[str, Path], @@ -183,7 +289,7 @@ def main_export( f"If you want to support {model_type} please propose a PR or open up an issue." 
) if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task( - task, exporter="onnx" + task, exporter="openvino" ): custom_architecture = True @@ -200,7 +306,7 @@ def main_export( if ( not custom_architecture and not is_stable_diffusion - and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx") + and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "openvino") ): if original_task == "auto": # Make -with-past the default if --task was not explicitely specified task = task + "-with-past" @@ -222,24 +328,15 @@ def main_export( preprocessors = maybe_load_preprocessors( model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code ) - if not task.startswith("text-generation"): - onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( - model=model, - task=task, - monolith=False, - custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, - custom_architecture=custom_architecture, - fn_get_submodels=fn_get_submodels, - preprocessors=preprocessors, - _variant="default", - ) - else: - # TODO : ModelPatcher will be added in next optimum release - model = patch_decoder_attention_mask(model) - - onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) - onnx_config = onnx_config_constructor(model.config) - models_and_onnx_configs = {"model": (model, onnx_config)} + onnx_config, models_and_onnx_configs = _get_submodels_and_export_configs( + model=model, + task=task, + custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, + custom_architecture=custom_architecture, + fn_get_submodels=fn_get_submodels, + preprocessors=preprocessors, + _variant="default", + ) if int8 is None: int8 = False @@ -276,7 +373,7 @@ def main_export( generation_config = getattr(model, "generation_config", None) if generation_config is not None: generation_config.save_pretrained(output) - maybe_save_preprocessors(model_name_or_path, output) + maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code) if model.config.is_encoder_decoder and task.startswith("text-generation"): raise ValueError( diff --git a/optimum/exporters/openvino/base.py b/optimum/exporters/openvino/base.py new file mode 100644 index 0000000000..2de28432c8 --- /dev/null +++ b/optimum/exporters/openvino/base.py @@ -0,0 +1,25 @@ +from copy import deepcopy +from typing import Callable, Type + +from optimum.exporters.tasks import TasksManager +from optimum.utils.normalized_config import NormalizedConfigManager + + +def init_model_configs(): + suppored_models = TasksManager._SUPPORTED_MODEL_TYPE + for model, export_configs in suppored_models.items(): + if "onnx" not in export_configs: + continue + TasksManager._SUPPORTED_MODEL_TYPE[model]["openvino"] = deepcopy( + TasksManager._SUPPORTED_MODEL_TYPE[model]["onnx"] + ) + + +def register_normalized_config(model_type: str) -> Callable[[Type], Type]: + def decorator(config_cls: Type) -> Type: + if model_type in NormalizedConfigManager._conf: + return config_cls + NormalizedConfigManager._conf[model_type] = config_cls + return config_cls + + return decorator diff --git a/optimum/exporters/openvino/dummy_input_generators.py b/optimum/exporters/openvino/dummy_input_generators.py new file mode 100644 index 0000000000..219b7193cf --- /dev/null +++ b/optimum/exporters/openvino/dummy_input_generators.py @@ -0,0 +1,61 @@ +from typing 
import Optional, Tuple + +from optimum.utils import ( + DEFAULT_DUMMY_SHAPES, + DummyPastKeyValuesGenerator, + DummyTextInputGenerator, + NormalizedTextConfig, +) + + +class ChatGLN2DummyTextInputGenerator(DummyTextInputGenerator): + SUPPORTED_INPUT_NAMES = { + "input_ids", + "attention_mask", + "token_type_ids", + "position_ids", + } + + +class ChatGLM2DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + ) + self.multi_query_group_num = normalized_config.multi_query_group_num + self.head_dim = self.hidden_size // self.num_attention_heads + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + past_key_shape = ( + self.sequence_length, + self.batch_size, + self.multi_query_group_num, + self.head_dim, + ) + past_value_shape = ( + self.sequence_length, + self.batch_size, + self.multi_query_group_num, + self.head_dim, + ) + return [ + ( + self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py new file mode 100644 index 0000000000..eeec30d75e --- /dev/null +++ b/optimum/exporters/openvino/model_configs.py @@ -0,0 +1,91 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
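A note on the generator just defined: ChatGLM2 keeps its KV cache in a sequence-first layout ([seq_len, batch, kv_groups, head_dim]) rather than the usual [batch, heads, seq_len, head_dim], which is why a dedicated dummy generator is needed. A rough, self-contained shape check (the config values are made up for illustration, and it assumes the modules added in this patch are importable):

from transformers import PretrainedConfig

from optimum.exporters.openvino.dummy_input_generators import ChatGLM2DummyPastKeyValuesGenerator
from optimum.exporters.openvino.normalized_configs import ChatGLM2NormalizedConfig

# illustrative ChatGLM2-like hyperparameters, not taken from a real checkpoint
config = PretrainedConfig(
    num_layers=2,
    num_attention_heads=32,
    hidden_size=4096,
    multi_query_group_num=2,
    padded_vocab_size=65024,
)
generator = ChatGLM2DummyPastKeyValuesGenerator(
    task="text-generation-with-past",
    normalized_config=ChatGLM2NormalizedConfig(config),
    batch_size=2,
    sequence_length=8,
)
past = generator.generate("past_key_values", framework="pt")
# one (key, value) pair per layer, each laid out as [seq_len, batch, kv_groups, head_dim]
assert len(past) == 2
assert tuple(past[0][0].shape) == (8, 2, 2, 4096 // 32)
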
+from typing import Callable, Dict, Type + +from optimum.exporters.onnx import TextDecoderOnnxConfig +from optimum.exporters.tasks import TasksManager, make_backend_config_constructor_for_task + +from .dummy_input_generators import ChatGLM2DummyPastKeyValuesGenerator, ChatGLN2DummyTextInputGenerator +from .normalized_configs import ChatGLM2NormalizedConfig + + +def create_register(overwrite_existing: bool = False): + def wrapper(model_type: str, *supported_tasks: str) -> Callable[[Type], Type]: + def decorator(config_cls: Type) -> Type: + mapping = TasksManager._SUPPORTED_MODEL_TYPE.get(model_type, {}) + mapping_backend = mapping.get("openvino", {}) + for task in supported_tasks: + normalized_task = task + if "-with-past" in task: + normalized_task = task.split("-with-past")[0] + if normalized_task not in TasksManager.get_all_tasks(): + known_tasks = ", ".join(TasksManager.get_all_tasks()) + raise ValueError( + f'The TasksManager does not know the task called "{task}", known tasks: {known_tasks}.' + ) + if not overwrite_existing and task in mapping_backend: + continue + mapping_backend[task] = make_backend_config_constructor_for_task(config_cls, task) + mapping["openvino"] = mapping_backend + TasksManager._SUPPORTED_MODEL_TYPE[model_type] = mapping + return config_cls + + return decorator + + return wrapper + + +register_in_tasks_manager = create_register(True) + + +@register_in_tasks_manager("chatglm", *["text-generation", "text-generation-with-past"]) +class ChatGLM2OpenVINOConfig(TextDecoderOnnxConfig): + NORMALIZED_CONFIG_CLASS = ChatGLM2NormalizedConfig + DUMMY_INPUT_GENERATOR_CLASSES = (ChatGLN2DummyTextInputGenerator, ChatGLM2DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = ChatGLM2DummyPastKeyValuesGenerator + no_position_ids = False + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = super().inputs + common_inputs.pop("attention_mask") + if not self.no_position_ids and self.task == "text-generation": + common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"} + + return common_inputs + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + """ + Fills `input_or_outputs` mapping with past_key_values dynamic axes considering the direction. + + Args: + inputs_or_outputs (`Dict[str, Dict[int, str]]`): + The mapping to fill. + direction (`str`): + either "inputs" or "outputs", it specifies whether `input_or_outputs` is the input mapping or the + output mapping, this is important for axes naming. 
+ """ + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_sequence_length + 1" + name = "present" + + for i in range(self._normalized_config.num_layers): + inputs_or_outputs[f"{name}.{i}.key"] = {1: "batch_size", 0: decoder_sequence_name} + inputs_or_outputs[f"{name}.{i}.value"] = {1: "batch_size", 0: decoder_sequence_name} diff --git a/optimum/exporters/openvino/normalized_configs.py b/optimum/exporters/openvino/normalized_configs.py new file mode 100644 index 0000000000..c50cf11741 --- /dev/null +++ b/optimum/exporters/openvino/normalized_configs.py @@ -0,0 +1,9 @@ +from optimum.utils import NormalizedTextConfig + +from .base import register_normalized_config + + +@register_normalized_config("chatglm") +class ChatGLM2NormalizedConfig(NormalizedTextConfig): + NUM_LAYERS = "num_layers" + VOCAB_SIZE = "padded_vocab_size" diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 68d737fe74..9e3262ac92 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -16,7 +16,7 @@ import os from pathlib import Path from tempfile import TemporaryDirectory -from typing import Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union import numpy as np import openvino @@ -35,6 +35,10 @@ from .utils import OV_XML_FILE_NAME, STR_TO_OV_TYPE +if TYPE_CHECKING: + pass + + if is_transformers_version("<", "4.25.0"): from transformers.generation_utils import GenerationMixin else: @@ -269,7 +273,9 @@ def _reshape( shapes[inputs][0] = -1 input_name = inputs.get_any_name() if input_name.startswith("past_key_values"): - if len(inputs.partial_shape) == 3 and input_name.endswith("value"): + if ( + len(inputs.partial_shape) == 3 and input_name.endswith("value") + ) or self.config.model_type == "chatglm": shapes[inputs][1] = -1 else: shapes[inputs][2] = -1 @@ -312,6 +318,7 @@ def forward( input_ids: torch.LongTensor, attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> CausalLMOutputWithPast: self.compile() @@ -345,6 +352,11 @@ def forward( for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) shape = model_inputs.get_partial_shape() + if self.config.model_type == "chatglm": + shape[0] = 0 + shape[1] = shape_input_ids[0] * num_attention_heads + inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) + continue shape[0] = shape_input_ids[0] * num_attention_heads if shape[2].is_dynamic: shape[2] = 0 @@ -358,6 +370,8 @@ def forward( if "attention_mask" in self.input_names and attention_mask is not None: inputs["attention_mask"] = np.array(attention_mask) + if "position_ids" in self.input_names and position_ids is not None: + inputs["position_ids"] = position_ids # Run inference self.request.start_async(inputs, shared_memory=True) self.request.wait() @@ -385,12 +399,21 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) 
+ if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + return { "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": self.use_cache, - "position_ids": None, - "attention_mask": kwargs.get("attention_mask", None), + "position_ids": position_ids, + "attention_mask": attention_mask, "token_type_ids": None, } From fae7802afe9afb5d4a947564ef8ec6780e38608e Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 19 Oct 2023 10:20:00 +0400 Subject: [PATCH 2/3] copyrights and code fixes --- optimum/exporters/openvino/__init__.py | 14 ++++++++++++++ optimum/exporters/openvino/__main__.py | 2 ++ optimum/exporters/openvino/base.py | 13 +++++++++++++ .../exporters/openvino/dummy_input_generators.py | 14 ++++++++++++++ optimum/exporters/openvino/normalized_configs.py | 14 ++++++++++++++ optimum/intel/openvino/modeling_decoder.py | 6 +----- 6 files changed, 58 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py index f21ca7e595..cd9ed271e0 100644 --- a/optimum/exporters/openvino/__init__.py +++ b/optimum/exporters/openvino/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .__main__ import main_export from .base import init_model_configs from .convert import export, export_models, export_pytorch_via_onnx diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 52c72944e8..65ecc90b53 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -97,6 +97,8 @@ def _get_submodels_and_export_configs( model=model, exporter="openvino", task=task ) onnx_config = onnx_config_constructor(model.config) + if onnx_config.use_past: + onnx_config.use_past_in_inputs = True models_and_onnx_configs = {"model": (model, onnx_config)} elif model.config.model_type == "sam": models_and_onnx_configs = get_sam_models_for_export(model, onnx_config) diff --git a/optimum/exporters/openvino/base.py b/optimum/exporters/openvino/base.py index 2de28432c8..edd8025ab1 100644 --- a/optimum/exporters/openvino/base.py +++ b/optimum/exporters/openvino/base.py @@ -1,3 +1,16 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. from copy import deepcopy from typing import Callable, Type diff --git a/optimum/exporters/openvino/dummy_input_generators.py b/optimum/exporters/openvino/dummy_input_generators.py index 219b7193cf..25439eb432 100644 --- a/optimum/exporters/openvino/dummy_input_generators.py +++ b/optimum/exporters/openvino/dummy_input_generators.py @@ -1,3 +1,17 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional, Tuple from optimum.utils import ( diff --git a/optimum/exporters/openvino/normalized_configs.py b/optimum/exporters/openvino/normalized_configs.py index c50cf11741..c5f00cff2a 100644 --- a/optimum/exporters/openvino/normalized_configs.py +++ b/optimum/exporters/openvino/normalized_configs.py @@ -1,3 +1,17 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
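As a side note on base.py above: init_model_configs() mirrors every registered ONNX export config under an "openvino" backend key, which is what lets the TasksManager lookups in __main__.py pass exporter="openvino". A minimal sketch of that behaviour ("gpt2" is only an example of an architecture that already ships an ONNX config):

from optimum.exporters.tasks import TasksManager
from optimum.exporters.openvino import init_model_configs  # also runs on package import

init_model_configs()  # copies each existing "onnx" entry under an "openvino" key
assert "openvino" in TasksManager._SUPPORTED_MODEL_TYPE["gpt2"]
assert "text-generation-with-past" in TasksManager._SUPPORTED_MODEL_TYPE["gpt2"]["openvino"]
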
+ from optimum.utils import NormalizedTextConfig from .base import register_normalized_config diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 9e3262ac92..78b2d790bd 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -16,7 +16,7 @@ import os from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union import numpy as np import openvino @@ -35,10 +35,6 @@ from .utils import OV_XML_FILE_NAME, STR_TO_OV_TYPE -if TYPE_CHECKING: - pass - - if is_transformers_version("<", "4.25.0"): from transformers.generation_utils import GenerationMixin else: From e596cc7e3e8709343047d645c94f38f5e9b3bcb3 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 20 Oct 2023 11:31:13 +0400 Subject: [PATCH 3/3] enable attention mask and fix accuracy issue for chatglm --- .../openvino/dummy_input_generators.py | 12 ++++++ optimum/exporters/openvino/model_configs.py | 1 - optimum/intel/openvino/modeling_decoder.py | 38 ++++++++++++++++--- optimum/intel/utils/modeling_utils.py | 38 +++++++++++++++++++ 4 files changed, 83 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/dummy_input_generators.py b/optimum/exporters/openvino/dummy_input_generators.py index 25439eb432..31673f45c6 100644 --- a/optimum/exporters/openvino/dummy_input_generators.py +++ b/optimum/exporters/openvino/dummy_input_generators.py @@ -14,6 +14,8 @@ from typing import Optional, Tuple +import torch + from optimum.utils import ( DEFAULT_DUMMY_SHAPES, DummyPastKeyValuesGenerator, @@ -30,6 +32,16 @@ class ChatGLN2DummyTextInputGenerator(DummyTextInputGenerator): "position_ids", } + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + input = super().generate(input_name, framework, int_dtype, float_dtype) + if input_name == "attention_mask": + input = torch.ones((input.shape[0], input.shape[1] + 1), dtype=input.dtype) + # input[0] = 0 + if input_name == "position_ids": + input = torch.range(0, input.shape[1] + 1, dtype=input.dtype).repeat(1, 1) + # input[0] = 0 + return input + class ChatGLM2DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index eeec30d75e..fcefbafd58 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -59,7 +59,6 @@ class ChatGLM2OpenVINOConfig(TextDecoderOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = super().inputs - common_inputs.pop("attention_mask") if not self.no_position_ids and self.task == "text-generation": common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"} diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 78b2d790bd..4f0a09ffd9 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -16,7 +16,7 @@ import os from pathlib import Path from tempfile import TemporaryDirectory -from typing import Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np import openvino @@ -25,7 +25,7 @@ from openvino.runtime import Core, Tensor, Type from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.file_utils import 
add_start_docstrings, add_start_docstrings_to_model_forward -from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput from optimum.utils import NormalizedConfigManager @@ -401,9 +401,8 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) return { "input_ids": input_ids, "past_key_values": past_key_values, @@ -413,6 +412,35 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "token_type_ids": None, } + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat([position_ids, new_position_id], dim=-1) + + model_kwargs["is_first_forward"] = False + return model_kwargs + def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 17abf1059e..5e94e94d06 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
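The _update_model_kwargs_for_generation override in modeling_decoder.py above boils down to simple per-step tensor bookkeeping; a standalone sketch of the same update with toy shapes, independent of the class:

import torch

# toy state after a prompt of length 4, batch size 1
attention_mask = torch.ones(1, 4, dtype=torch.long)
position_ids = torch.arange(4).unsqueeze(0)

# one decoding step: grow the mask by one and append (last position + 1)
attention_mask = torch.cat(
    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
)
new_position_id = position_ids[..., -1:].clone() + 1
position_ids = torch.cat([position_ids, new_position_id], dim=-1)

assert attention_mask.shape == (1, 5)
assert position_ids[0, -1].item() == 4
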
+import types from typing import Tuple import torch @@ -92,6 +93,40 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, return combined_attention_mask +@torch.jit.script_if_tracing +def _chatglm2_get_context_layer(query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor): + mask = torch.zeros((query_layer.shape[-2], key_layer.shape[-2]), dtype=query_layer.dtype) + if query_layer.shape[2] == key_layer.shape[2]: + tmp_mask = torch.ones((query_layer.shape[-2], key_layer.shape[-2]), dtype=torch.bool).triu(diagonal=1) + mask.masked_fill_(tmp_mask, float("-inf")) + + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, attn_mask=mask) + return context_layer + + +def _core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None: + context_layer = _chatglm2_get_context_layer(query_layer, key_layer, value_layer) + else: + attention_mask = ~attention_mask + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, key_layer, value_layer, attention_mask + ) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + + return context_layer + + +def _patch_chatglm_core_attention_forward(model: "PreTrainedModel"): + for block in model.transformer.encoder.layers: + block.self_attention.core_attention.forward = types.MethodType( + _core_attention_forward, block.self_attention.core_attention + ) + + def patch_decoder_attention_mask(model: "PreTrainedModel"): """ Apply patch on decoder with past model forward to resolve first inference based on model architecture @@ -108,4 +143,7 @@ def patch_decoder_attention_mask(model: "PreTrainedModel"): model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask elif model.config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + elif model.config.model_type == "chatglm": + _patch_chatglm_core_attention_forward(model) + return model
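Taken together, the series is meant to let a ChatGLM2-style checkpoint go through the standard OpenVINO export and generation path. A rough end-to-end usage sketch follows; the checkpoint id, output directory and generation arguments are only illustrative, and the keyword names are assumed to match the existing main_export and from_pretrained signatures:

from transformers import AutoTokenizer

from optimum.exporters.openvino import main_export
from optimum.intel import OVModelForCausalLM

model_id = "THUDM/chatglm2-6b"  # illustrative ChatGLM2 checkpoint served with remote code

# export to OpenVINO IR using the chatglm export config registered by this patch
main_export(
    model_name_or_path=model_id,
    output="chatglm2_ov",
    task="text-generation-with-past",
    trust_remote_code=True,
)

# load the exported IR and generate; position_ids handling comes from the
# OVModelForCausalLM changes above
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = OVModelForCausalLM.from_pretrained("chatglm2_ov", trust_remote_code=True)
inputs = tokenizer("What is OpenVINO?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))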