diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 7e0be486ef..c07e1544f2 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -2300,37 +2300,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
         return generated_input
 
 
-class DummyQwen2VLVisionEMbedInputGenerator(DummyVisionInputGenerator):
-    SUPPORTED_INPUT_NAMES = ("hidden_states",)
-
-    def __init__(
-        self,
-        task: str,
-        normalized_config: NormalizedVisionConfig,
-        batch_size: int = 1,
-        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
-        width: int = 420,
-        height: int = 420,
-        **kwargs,
-    ):
-        self.batch_size = batch_size
-        self.height = height
-        self.width = width
-        self.num_channels = num_channels
-        self.temporal_patch_size = normalized_config.config.temporal_patch_size
-        self.patch_size = normalized_config.config.patch_size
-
-    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
-        grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size
-        grid_t = self.batch_size
-        shape = [
-            grid_t * grid_h * grid_w,
-            self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size,
-        ]
-        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
-
-
-class DummyQwen2VLVisionEmbedMergerInputGenerator(DummyVisionInputGenerator):
+class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator):
     SUPPORTED_INPUT_NAMES = ("hidden_states", "attention_mask", "rotary_pos_emb")
 
     def __init__(
@@ -2349,7 +2319,10 @@ def __init__(
         self.num_channels = num_channels
         self.temporal_patch_size = normalized_config.config.temporal_patch_size
         self.patch_size = normalized_config.config.patch_size
-        self.embed_dim = normalized_config.config.embed_dim
+        if normalized_config.use_embed_dim:
+            self.embed_dim = normalized_config.config.embed_dim
+        else:
+            self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
         self.num_heads = normalized_config.config.num_heads
 
     def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
@@ -2382,7 +2355,7 @@ class Qwen2VLConfigBehavior(str, enum.Enum):
 class Qwen2VLOpenVINOConfig(OnnxConfig):
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior]
     NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
-    DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEMbedInputGenerator,)
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,)
     MIN_TRANSFORMERS_VERSION = version.parse("4.45.0")
 
     def __init__(
@@ -2405,12 +2378,12 @@ def __init__(
         self._orig_config = config
         if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
             self._config = config.vision_config
             self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
-            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEMbedInputGenerator,)
+            self._normalized_config.use_embed_dim = False
         if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER and hasattr(config, "vision_config"):
             self._config = config.vision_config
             self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
-            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedMergerInputGenerator,)
+            self._normalized_config.use_embed_dim = True
 
     @staticmethod
     def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]):
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 2405b68626..41c879e481 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -29,6 +29,7 @@
 from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
 from ...exporters.openvino.utils import save_config
 from .. import OVQuantizer
+from ..utils.import_utils import is_transformers_version
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
 from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM
@@ -2096,13 +2097,13 @@ def __init__(
             quantization_config=quantization_config,
             **kwargs,
         )
-        try:
+        if is_transformers_version(">=", "4.45.0"):
             from transformers.models.qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding
 
             self._rotary_pos_emb = VisionRotaryEmbedding(
                 self.config.vision_config.embed_dim // self.config.vision_config.num_heads // 2
             )
-        except ImportError:
+        else:
             raise ValueError(
                 f"Initialization model for {self.config.model_type} required at least transformers >= 4.45"
             )