From 91095b91b63509c379e26a08cbde8c656a3d7f43 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Mon, 28 Oct 2024 13:16:06 +0400
Subject: [PATCH] extend tests

---
 .../openvino/modeling_visual_language.py | 26 ++++++++-------
 tests/openvino/test_modeling.py          | 33 +++++++++++++++++++
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index cb72799ea..05f2e9282 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -722,7 +722,8 @@ def __init__(
             quantization_config=quantization_config,
             **kwargs,
         )
-        self._legacy_processing = not hasattr(self.config, "image_seq_length")
+        self._support_new_processing = hasattr(self.config, "image_seq_length")
+        self._legacy_processing = not self._support_new_processing
 
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
@@ -758,9 +759,7 @@ def merge_vision_text_embeddings(
         image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
         inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
         if legacy_processing is None:
-            legacy_processing = not (hasattr(self.config, "image_seq_length") and (input_ids.shape[-1] == 1)) or (
-                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
-            )
+            legacy_processing = self._legacy_processing
 
         if legacy_processing:
             pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
@@ -840,20 +839,19 @@ def merge_vision_text_embeddings(
     def get_multimodal_embeddings(
         self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, past_key_values=None, **kwargs
     ):
-        legacy_processing = self._legacy_processing
         inputs_embeds = self.get_text_embeddings(input_ids, **kwargs)
 
-        if pixel_values is not None and not legacy_processing and past_key_values is None:
+        if pixel_values is not None and self._support_new_processing and past_key_values is None:
             legacy_processing = (input_ids == self.config.image_token_index).sum(
                 1
             ).max() < self.config.image_seq_length
             self._legacy_processing = legacy_processing
 
         inputs_embeds, attention_mask, position_ids = super().get_multimodal_embeddings(
-            input_ids, pixel_values, attention_mask, position_ids, legacy_processing=legacy_processing, **kwargs
+            input_ids, pixel_values, attention_mask, position_ids, legacy_processing=self._legacy_processing, **kwargs
         )
 
-        if legacy_processing and pixel_values is not None and past_key_values is not None:
+        if self._legacy_processing and pixel_values is not None and past_key_values is not None:
             attention_mask, position_ids = self._filter_unattended_tokens(input_ids, attention_mask, past_key_values)
 
         return inputs_embeds, attention_mask, position_ids
@@ -966,9 +964,8 @@ def get_multimodal_embeddings(
         from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches
 
         inputs_embeds = self.get_text_embeddings(input_ids, **kwargs)
-        legacy_processing = self._legacy_processing
 
-        if pixel_values is not None and not legacy_processing and past_key_values is None:
+        if pixel_values is not None and self._support_new_processing and past_key_values is None:
             legacy_processing = (input_ids == self.config.image_token_index).sum(
                 1
             ).max() < self.config.image_seq_length
@@ -1010,11 +1007,16 @@ def get_multimodal_embeddings(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
-            legacy_processing=legacy_processing,
+            legacy_processing=self._legacy_processing,
             **kwargs,
         )
 
-        if legacy_processing and pixel_values is not None and past_key_values is not None and input_ids.shape[1] == 1:
+        if (
+            self._legacy_processing
+            and pixel_values is not None
+            and past_key_values is not None
+            and input_ids.shape[1] == 1
+        ):
             attention_mask, position_ids = self._filter_unattended_tokens(input_ids, attention_mask, past_key_values)
 
         return inputs_embeds, attention_mask, position_ids
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 169701e4a..e66407a2c 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -1984,6 +1984,39 @@ def test_compare_to_transformers(self, model_arch):
                 f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model output {ov_outputs}",
             )
 
+        # the previous run used legacy processing; run once more with image features concatenated at the preprocessing level
+        if (
+            model_arch in ["llava", "llava-next"]
+            and is_transformers_version(">=", "4.45")
+            and (processor.patch_size is None or processor.vision_feature_select_strategy is None)
+        ):
+            processor.patch_size = ov_model.config.vision_config.patch_size
+            processor.vision_feature_select_strategy = ov_model.config.vision_feature_select_strategy
+            if model_arch == "llava":
+                # the llava test model does not specify image_seq_length, and it differs from the default
+                transformers_model.config.image_seq_length = 225
+                ov_model.config.image_seq_length = 225
+            self.assertTrue(processor.patch_size is not None)
+            self.assertTrue(processor.vision_feature_select_strategy is not None)
+            inputs = processor(images=self.IMAGE, text=prompt, return_tensors="pt")
+            self.assertTrue(
+                (inputs.input_ids == ov_model.config.image_token_index).sum(1).max()
+                >= ov_model.config.image_seq_length
+            )
+            set_seed(SEED)
+            with torch.no_grad():
+                transformers_outputs = transformers_model(**inputs)
+            set_seed(SEED)
+            ov_outputs = ov_model(**inputs)
+            self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
+            set_seed(SEED)
+            ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
+            set_seed(SEED)
+            transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+            self.assertTrue(
+                torch.equal(ov_outputs, transformers_outputs),
+                f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model output {ov_outputs}",
+            )
         del transformers_model
         del ov_model
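
For reviewers who want to exercise the non-legacy path outside the test suite, the sketch below mirrors what the new test branch does: it fills in `patch_size` and `vision_feature_select_strategy` on the processor so the image placeholder token is expanded to `image_seq_length` tokens during preprocessing (transformers >= 4.45), then runs generation on the OpenVINO model. This is a minimal sketch, not part of the patch: the checkpoint ID and image URL are placeholders, and loading through `OVModelForVisualCausalLM` with `export=True` is assumed to be the entry point.

```python
# Minimal sketch (placeholder checkpoint and image URL); assumes OVModelForVisualCausalLM
# is the OpenVINO loading entry point for visual language models.
import requests
import torch
from PIL import Image
from transformers import AutoConfig, AutoProcessor

from optimum.intel import OVModelForVisualCausalLM

model_id = "llava-hf/llava-1.5-7b-hf"  # placeholder checkpoint
config = AutoConfig.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# The non-legacy path requires these two processor attributes; when they are unset,
# the model falls back to legacy merging of vision and text embeddings.
if getattr(processor, "patch_size", None) is None:
    processor.patch_size = config.vision_config.patch_size
if getattr(processor, "vision_feature_select_strategy", None) is None:
    processor.vision_feature_select_strategy = config.vision_feature_select_strategy

model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)

image = Image.open(requests.get("https://example.com/cat.png", stream=True).raw)  # placeholder image
prompt = "USER: <image>\nWhat is shown in this picture? ASSISTANT:"
inputs = processor(images=image, text=prompt, return_tensors="pt")

# With the processor configured, the <image> token is expanded during preprocessing,
# so the same check used in the new test should hold here.
assert (inputs.input_ids == config.image_token_index).sum(1).max() >= config.image_seq_length

with torch.no_grad():
    generated = model.generate(**inputs, max_new_tokens=20)
print(processor.batch_decode(generated, skip_special_tokens=True)[0])
```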