Fix general code style and apply renaming suggestions
eaidova committed Dec 21, 2023
1 parent aa299e6 commit 6b0236b
Showing 5 changed files with 73 additions and 65 deletions.
optimum/commands/export/openvino.py (4 changes: 2 additions & 2 deletions)
@@ -95,7 +95,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--stateful",
         action="store_true",
-        help="Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs"
+        help="Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs",
     )


@@ -143,7 +143,7 @@ def run(self):
             trust_remote_code=self.args.trust_remote_code,
             pad_token_id=self.args.pad_token_id,
             compression_option=self.args.weight_format,
-            compression_ratio=self.args.ratio
+            compression_ratio=self.args.ratio,
             stateful=self.args.stateful,
             # **input_shapes,
         )
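
For context (not part of the commit), a minimal sketch of the equivalent programmatic call; the first two parameter names and the model id/output path are assumptions:

from optimum.exporters.openvino import main_export

# Hypothetical equivalent of the CLI path above; the remaining keywords
# mirror exactly what run() forwards in the hunk.
main_export(
    model_name_or_path="gpt2",  # assumed parameter name, placeholder model id
    output="ov_model",          # assumed parameter name, placeholder directory
    compression_option="int8",  # maps to --weight-format
    compression_ratio=1.0,      # maps to --ratio
    stateful=True,              # maps to the new --stateful flag
)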
optimum/exporters/openvino/better_transformer_patch.py (15 changes: 9 additions & 6 deletions)
@@ -13,26 +13,29 @@
 # limitations under the License.


-import torch
 import types

+import torch
+

 def patch_model_with_bettertransformer(model, model_config):
     try:
         model = model.to_bettertransformer()
     except Exception as e:
-        print(f'[ WARNING ] Cannot apply model.to_bettertransformer because of the exception:\n{e}')
+        print(f"[ WARNING ] Cannot apply model.to_bettertransformer because of the exception:\n{e}")
         return model

     # for better transformers we need sequence lenght to be not 1 to make a correct trace
     # patch generate_dummy_inputs in the config

     def pathed_generate_dummy_inputs(self, *args, **kwargs):
         dummy_inputs = self._original_generate_dummy_inputs(*args, **kwargs)
-        if 'input_ids' in dummy_inputs and dummy_inputs['input_ids'].shape[1] == 1:
-            dummy_inputs['input_ids'] = torch.cat([dummy_inputs['input_ids'], dummy_inputs['input_ids']], dim=-1)
-            attention_mask = dummy_inputs['attention_mask']
-            dummy_inputs['attention_mask'] = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
+        if "input_ids" in dummy_inputs and dummy_inputs["input_ids"].shape[1] == 1:
+            dummy_inputs["input_ids"] = torch.cat([dummy_inputs["input_ids"], dummy_inputs["input_ids"]], dim=-1)
+            attention_mask = dummy_inputs["attention_mask"]
+            dummy_inputs["attention_mask"] = torch.cat(
+                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+            )
         return dummy_inputs

     model_config._original_generate_dummy_inputs = model_config.generate_dummy_inputs
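
The patched dummy-input logic above can be exercised standalone; a minimal sketch with illustrative tensors:

import torch

# Widen a length-1 dummy input so BetterTransformer tracing sees seq_len > 1,
# mirroring pathed_generate_dummy_inputs above (tensor values are illustrative).
dummy_inputs = {
    "input_ids": torch.tensor([[42]]),  # shape (1, 1)
    "attention_mask": torch.ones((1, 1), dtype=torch.long),
}
if "input_ids" in dummy_inputs and dummy_inputs["input_ids"].shape[1] == 1:
    dummy_inputs["input_ids"] = torch.cat([dummy_inputs["input_ids"], dummy_inputs["input_ids"]], dim=-1)
    attention_mask = dummy_inputs["attention_mask"]
    dummy_inputs["attention_mask"] = torch.cat(
        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
    )
print(dummy_inputs["input_ids"].shape)  # torch.Size([1, 2])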
optimum/exporters/openvino/convert.py (13 changes: 7 additions & 6 deletions)
@@ -30,10 +30,10 @@
 from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx
 from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
 from optimum.utils import is_diffusers_available
-from .stateful import patch_stateful, raise_if_openvino_is_too_old
-from .better_transformer_patch import patch_model_with_bettertransformer

 from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
+from .better_transformer_patch import patch_model_with_bettertransformer
+from .stateful import patch_stateful, raise_if_openvino_is_too_old
 from .utils import (
     OV_XML_FILE_NAME,
     clear_class_registry,
@@ -396,8 +396,9 @@ def ts_patched_forward(*args, **kwargs):
             model.forward = orig_forward
         if stateful:
             raise ValueError(
-                'Making stateful models is not supported when exporting to ONNX as an intermediate step. '
-                'Set stateful=False, or provide a model that can be converted to OpenVINO without fallback to ONNX conversion path.')
+                "Making stateful models is not supported when exporting to ONNX as an intermediate step. "
+                "Set stateful=False, or provide a model that can be converted to OpenVINO without fallback to ONNX conversion path."
+            )
         return export_pytorch_via_onnx(
             model,
             config,
@@ -432,8 +433,8 @@ def ts_patched_forward(*args, **kwargs):

     if stateful:
         # Patching model according to stateful parameters
-        model.key_value_input_names = [name for name in input_names if name.startswith('past_key_values.')]
-        model.key_value_output_names = [name for name in output_names if name.startswith('present.')]
+        model.key_value_input_names = [name for name in input_names if name.startswith("past_key_values.")]
+        model.key_value_output_names = [name for name in output_names if name.startswith("present.")]
         patch_stateful(model, ov_model)

     _save_model(ov_model, output, compression_option=compression_option, compression_ratio=compression_ratio)
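
The name-based kv-cache detection above reduces to simple prefix filtering; a small sketch with hypothetical tensor names:

# Hypothetical input/output names of an exported decoder, filtered the same
# way as in the hunk above.
input_names = ["input_ids", "attention_mask", "past_key_values.0.key", "past_key_values.0.value"]
output_names = ["logits", "present.0.key", "present.0.value"]

key_value_input_names = [name for name in input_names if name.startswith("past_key_values.")]
key_value_output_names = [name for name in output_names if name.startswith("present.")]
print(key_value_input_names)   # ['past_key_values.0.key', 'past_key_values.0.value']
print(key_value_output_names)  # ['present.0.key', 'present.0.value']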
optimum/exporters/openvino/stateful.py (92 changes: 49 additions & 43 deletions)
@@ -14,23 +14,23 @@


 import numpy as np
-from packaging import version

 import openvino as ov
 from openvino.runtime import opset13
 from optimum.intel.utils.import_utils import is_openvino_version
+from optimum.utils.normalized_config import NormalizedConfigManager


-def model_has_name(ov_model: ov.Model, name: str):
-    return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], list())
+def model_has_input_output_name(ov_model: ov.Model, name: str):
+    return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], [])


 def model_has_input(ov_model: ov.Model, name: str):
-    return name in sum([list(t.get_names()) for t in ov_model.inputs], list())
+    return name in sum([list(t.get_names()) for t in ov_model.inputs], [])


 def model_has_cache_reorder(ov_model):
-    return model_has_input(ov_model, 'beam_idx')
+    return model_has_input(ov_model, "beam_idx")


 def model_has_state(ov_model):
@@ -39,18 +39,18 @@ def model_has_state(ov_model):


 def fuse_cache_reorder(ov_model: ov.Model, not_kv_inputs, key_value_input_names, gather_dim: int):
-    """ Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model.
-    Should be run before make_stateful. Implements optimumum's _reorder_cache
-    inside the model in the beginning of each iteration.
-    Gather works along given gather_dim dimension that may vary from model to model.
-    KV-cache inputs are identified based on names in key_value_input_names.
-    Append the new beam_idx parameter to not_kv_inputs.
+    """Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model.
+    Should be run before make_stateful. Implements optimumum's _reorder_cache
+    inside the model in the beginning of each iteration.
+    Gather works along given gather_dim dimension that may vary from model to model.
+    KV-cache inputs are identified based on names in key_value_input_names.
+    Append the new beam_idx parameter to not_kv_inputs.
     """

-    assert not model_has_name(ov_model, 'beam_idx')
-    input_batch = ov_model.input('input_ids').get_partial_shape()[0]
-    beam_idx = opset13.parameter(name='beam_idx', dtype=ov.Type.i32, shape=ov.PartialShape([input_batch]))
-    beam_idx.output(0).get_tensor().add_names({'beam_idx'})  # why list is not accepted?
+    assert not model_has_input_output_name(ov_model, "beam_idx")
+    input_batch = ov_model.input("input_ids").get_partial_shape()[0]
+    beam_idx = opset13.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch]))
+    beam_idx.output(0).get_tensor().add_names({"beam_idx"})  # why list is not accepted?
     ov_model.add_parameters([beam_idx])
     not_kv_inputs.append(ov_model.inputs[-1])
     # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
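
Conceptually, the fused Gather performs the reordering that optimum's _reorder_cache otherwise does outside the model; a numpy sketch with illustrative shapes:

import numpy as np

# What the fused Gather does each step: reorder the kv-cache along the
# batch/beam dimension with beam_idx (shapes and values are illustrative).
past_key = np.arange(3 * 2 * 4, dtype=np.float32).reshape(3, 2, 4)
beam_idx = np.array([2, 0, 1], dtype=np.int32)  # reordering chosen by beam search
reordered = np.take(past_key, beam_idx, axis=0)
assert (reordered[0] == past_key[2]).all()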
@@ -65,29 +65,29 @@ def fuse_cache_reorder(ov_model: ov.Model, not_kv_inputs, key_value_input_names, gather_dim: int):

 def build_state_initializer(ov_model: ov.Model, batch_dim):
     """Build initialization ShapeOf Expression for all ReadValue ops"""
-    input_ids = ov_model.input('input_ids')
-    batch = opset13.gather(opset13.shape_of(input_ids, output_type='i64'), opset13.constant([0]), opset13.constant(0))
+    input_ids = ov_model.input("input_ids")
+    batch = opset13.gather(opset13.shape_of(input_ids, output_type="i64"), opset13.constant([0]), opset13.constant(0))
     for op in ov_model.get_ops():
-        if op.get_type_name() == 'ReadValue':
+        if op.get_type_name() == "ReadValue":
             dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))]
             dims[batch_dim] = batch
-            dims = [opset13.constant(np.array([dim], dtype=np.int64)) if type(dim) is int else dim for dim in dims]
+            dims = [opset13.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim for dim in dims]
             shape = opset13.concat(dims, axis=0)
             broadcast = opset13.broadcast(opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape)
             op.set_arguments([broadcast])
     ov_model.validate_nodes_and_infer_types()


 def make_stateful(
-        ov_model: ov.Model,
-        not_kv_inputs,
-        key_value_input_names,
-        key_value_output_names,
-        batch_dim,
-        num_attention_heads,
-        num_beams_and_batch=None):
-    """ Hides kv-cache inputs and outputs inside the model as variables.
-    """
+    ov_model: ov.Model,
+    not_kv_inputs,
+    key_value_input_names,
+    key_value_output_names,
+    batch_dim,
+    num_attention_heads,
+    num_beams_and_batch=None,
+):
+    """Hides kv-cache inputs and outputs inside the model as variables."""
     from openvino._offline_transformations import apply_make_stateful_transformation

     input_output_map = {}
@@ -101,7 +101,7 @@ def make_stateful(
                 shape[0] = num_beams_and_batch
                 input.get_node().set_partial_shape(shape)
             else:
-                print(f'[ WARNING ] Rank of {input.get_any_name()} input of the model is not 2, batch size is not set')
+                print(f"[ WARNING ] Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")

     for kv_name_pair in zip(key_value_input_names, key_value_output_names):
         input_output_map[kv_name_pair[0]] = kv_name_pair[1]
@@ -122,27 +122,33 @@

 def raise_if_openvino_is_too_old():
     if is_openvino_version("<=", "2023.2"):
-        raise ValueError(f'Could not create or use stateful model when using old version of openvino=={ov.__version__}. Install openvino>=2023.3.0.')
+        raise ValueError(
+            f"Could not create or use stateful model when using old version of openvino=={ov.__version__}. Install openvino>=2023.3.0."
+        )


-def patch_stateful(model, ov_model):
+def patch_stateful(config, ov_model):
     raise_if_openvino_is_too_old()
-    not_kv_inputs = [input for input in ov_model.inputs if not any(name in model.key_value_input_names for name in input.get_names())]
+
+    key_value_input_names = [
+        key.get_any_name() for key in ov_model.inputs if any("key_values" in key_name for key_name in key.names)
+    ]
+    key_value_output_names = [
+        key.get_any_name() for key in ov_model.output if any("present" in key_name for key_name in key.names)
+    ]
+    not_kv_inputs = [
+        input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())
+    ]

     # By default, batch is the 0-th but chatglm uses 1-st dimension as batch
     # TODO: Deduce from a model via ordinal reshape (?) and topology
-    batch_dim = 1 if model.config.model_type == 'chatglm' else 0
+    batch_dim = 1 if config.model_type == "chatglm" else 0

-    fuse_cache_reorder(ov_model, not_kv_inputs, model.key_value_input_names, batch_dim)
+    fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)

-    normalized_config = NormalizedConfigManager.get_normalized_config_class(model.config.model_type)(model.config)
-    num_attention_heads = normalized_config.num_attention_heads if model.config.model_type == 'bloom' else 1
+    normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
+    num_attention_heads = normalized_config.num_attention_heads if config.model_type == "bloom" else 1

     make_stateful(
-        ov_model,
-        not_kv_inputs,
-        model.key_value_input_names,
-        model.key_value_output_names,
-        batch_dim,
-        num_attention_heads,
-        None)
+        ov_model, not_kv_inputs, key_value_input_names, key_value_output_names, batch_dim, num_attention_heads, None
+    )
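
A usage sketch of the renamed entry point (not from the commit); the model path and id are placeholders:

import openvino as ov
from transformers import AutoConfig
from optimum.exporters.openvino import patch_stateful

core = ov.Core()
ov_model = core.read_model("ov_model/openvino_model.xml")  # hypothetical path to an exported decoder
config = AutoConfig.from_pretrained("gpt2")                # hypothetical model id
patch_stateful(config, ov_model)  # past_key_values.*/present.* become internal state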
optimum/intel/openvino/modeling_decoder.py (14 changes: 6 additions & 8 deletions)
@@ -29,12 +29,11 @@

 from optimum.utils import NormalizedConfigManager

-from ...exporters.openvino import main_export
+from ...exporters.openvino import main_export, patch_stateful, raise_if_openvino_is_too_old
 from ..utils.import_utils import is_transformers_version
 from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
 from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
 from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE
-from ...exporters.openvino import patch_stateful, raise_if_openvino_is_too_old


 if is_transformers_version("<", "4.25.0"):
@@ -164,8 +163,7 @@ def raise_error(model_prop, user_prop, name):
             self.compile()

         if use_cache ^ self.use_cache:
-            raise_error(self.use_cache, use_cache, 'use_cache')
-
+            raise_error(self.use_cache, use_cache, "use_cache")

     def update_pkv_precision(self, force_fp32=False):
         if not self.use_cache or self.stateful:
@@ -310,7 +308,7 @@ def compile(self):
             self.request = self.request.create_infer_request()

     def _make_stateful(self):
-        patch_stateful(self, self.model)
+        patch_stateful(self.config, self.model)
         self.stateful = True


@@ -379,7 +377,7 @@ def forward(
             for input_name in self.key_value_input_names:
                 model_inputs = self.model.input(input_name)
                 shape = model_inputs.get_partial_shape()
-                if self.config.model_type == 'chatglm':
+                if self.config.model_type == "chatglm":
                     shape[0] = 0
                     shape[1] = batch_size
                 else:
@@ -427,8 +425,8 @@ def forward(

inputs["position_ids"] = position_ids

if hasattr(self, 'next_beam_idx'):
inputs['beam_idx'] = self.next_beam_idx
if hasattr(self, "next_beam_idx"):
inputs["beam_idx"] = self.next_beam_idx

# Run inference
self.request.start_async(inputs, share_inputs=True)
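
For reference, the stateful inference pattern used above, sketched outside the class; the path, tensor names, and shapes are illustrative:

import numpy as np
import openvino as ov

# With a stateful model the kv-cache lives in the infer request's internal
# state, so only regular inputs (plus beam_idx) are fed at each step.
core = ov.Core()
compiled = core.compile_model("ov_model/openvino_model.xml", "CPU")  # hypothetical path
request = compiled.create_infer_request()

inputs = {
    "input_ids": np.array([[101]], dtype=np.int64),
    "attention_mask": np.array([[1]], dtype=np.int64),
    "beam_idx": np.array([0], dtype=np.int32),
}
request.start_async(inputs, share_inputs=True)
request.wait()
logits = request.get_tensor("logits").data  # output name as exported by optimum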
