From ee7d80ffec6d3fe2d87425ee069cf21b09e35012 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 10 Dec 2024 14:30:22 +0100
Subject: [PATCH 1/6] Fix vlm compression

---
 optimum/commands/export/openvino.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 5e951aa438..54570d3bfe 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -361,7 +361,7 @@ def run(self):
             model.save_pretrained(self.args.output)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif (task.startswith("text-generation") or task == "image-text-to-text") and quantize_with_dataset:
+        elif task.startswith("text-generation") and quantize_with_dataset or task == "image-text-to-text":
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM

                 model_cls = OVModelForVisualCausalLM

-            # To quantize a model with a dataset, an instance of a model class is required
+            # In this case, to apply quantization an instance of a model class is required
             model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,

From 27358a22773d38f6856551909d489ef853a1dd88 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 10 Dec 2024 14:30:51 +0100
Subject: [PATCH 2/6] Extend compression tests to check submodel weights
 precision

---
 tests/openvino/test_exporters_cli.py | 101 ++++++++++++++++++++------
 tests/openvino/test_quantization.py  | 102 ++++++++++++++++++---------
 tests/openvino/utils_tests.py        |  18 +++++
 3 files changed, 165 insertions(+), 56 deletions(-)

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index f94d0f4b5d..4fd2627b60 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -14,12 +14,14 @@
 import subprocess
 import unittest
 from pathlib import Path
+from typing import Dict, List

 from parameterized import parameterized
 from transformers import AutoModelForCausalLM
 from utils_tests import (
     _ARCHITECTURES_TO_EXPECTED_INT8,
     MODEL_NAMES,
+    compare_num_quantized_nodes_per_model,
     get_num_quantized_nodes,
 )

@@ -107,37 +109,47 @@ class OVCLIExportTestCase(unittest.TestCase):
         SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65))

     TEST_4BIT_CONFIGURATIONS = [
-        ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}),
-        ("text-generation-with-past", "opt125m", "int4 --group-size 64", {"int8": 4, "int4": 144}),
-        ("text-generation-with-past", "opt125m", "mxfp4", {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}),
-        ("text-generation-with-past", "opt125m", "nf4", {"int8": 4, "nf4": 72}),
-        ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 8 --all-layers", {"int4": 16}),
+        ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", [{"int8": 4, "int4": 72}]),
+        ("text-generation-with-past", "opt125m", "int4 --group-size 64", [{"int8": 4, "int4": 144}]),
+        ("text-generation-with-past", "opt125m", "mxfp4", [{"int8": 4, "f4e2m1": 72, "f8e8m0": 72}]),
+        ("text-generation-with-past", "opt125m", "nf4", [{"int8": 4, "nf4": 72}]),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --ratio 1.0 --sym --group-size 8 --all-layers",
+            [{"int4": 16}],
+        ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 "
             "--sensitivity-metric max_activation_variance",
-            {"int8": 4, "int4": 14},
+            [{"int8": 4, "int4": 14}],
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 --num-samples 100 ",
-            {"int8": 4, "int4": 14},
+            [{"int8": 4, "int4": 14}],
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --gptq --dataset wikitext2 --num-samples 100 ",
-            {"int8": 4, "int4": 14},
+            [{"int8": 4, "int4": 14}],
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --lora-correction --dataset auto --num-samples 16",
-            {"int8": 60, "int4": 14},
+            [{"int8": 60, "int4": 14}],
+        ),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --group-size 16 --backup-precision none --ratio 0.5",
+            [{"int4": 12}],
         ),
-        ("text-generation-with-past", "llama_awq", "int4 --group-size 16 --backup-precision none", {"int4": 28}),
     ]

     if is_transformers_version(">=", "4.40.0"):
         TEST_4BIT_CONFIGURATIONS.extend(
             [
                 (
                     "image-text-to-text",
                     "llava_next",
-                    'int4 --group-size 16 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
+                    "int4 --group-size 16 --ratio 0.8",
+                    [{"int8": 14, "int4": 16}, {"int8": 9}, {"int8": 1}],
+                ),
+                (
+                    "image-text-to-text",
+                    "llava_next",
+                    'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" '
                     "--dataset contextual --num-samples 1",
-                    {"int8": 8, "int4": 22},
+                    [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}],
+                ),
+                (
+                    "image-text-to-text",
+                    "nanollava",
+                    "int4 --group-size 8 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
                 ),
                 (
                     "image-text-to-text",
                     "nanollava",
-                    'int4 --group-size 8 --ratio 0.9 --sensitivity-metric "mean_activation_variance" '
+                    'int4 --group-size 8 --ratio 0.8 --sensitivity-metric "mean_activation_variance" '
                     "--dataset contextual --num-samples 1 --trust-remote-code",
-                    {"int8": 12, "int4": 18},
+                    [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
                 ),
             ]
         )

     if is_transformers_version(">=", "4.45.0"):
         TEST_4BIT_CONFIGURATIONS.extend(
             [
+                (
+                    "image-text-to-text",
+                    "minicpmv",
+                    "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 10, "int4": 20}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                ),
+                (
+                    "image-text-to-text",
+                    "minicpmv",
+                    'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
+                    "--dataset contextual --num-samples 1 --trust-remote-code",
+                    [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                ),
+                (
+                    "image-text-to-text",
+                    "internvl2",
+                    "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
                 ),
                 (
                     "image-text-to-text",
                     "internvl2",
-                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "hessian_input_activation" '
+                    'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                     "--dataset contextual --num-samples 1 --trust-remote-code",
-                    {"int8": 6, "int4": 24},
+                    [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+                ),
+                (
+                    "image-text-to-text",
+                    "phi3_v",
+                    "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 8, "int4": 10}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
                 ),
                 (
                     "image-text-to-text",
                     "phi3_v",
-                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
+                    'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                     "--dataset contextual --num-samples 1 --trust-remote-code",
-                    {"int8": 4, "int4": 14},
+                    [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
                 ),
             ]
         )

@@ -299,7 +348,9 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in
         self.assertEqual(exp_num_fq, num_fq)

     @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
-    def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict):
+    def test_exporters_cli_4bit(
+        self, task: str, model_type: str, option: str, expected_num_weight_nodes_per_model: List[Dict]
+    ):
         with TemporaryDirectory() as tmpdir:
             result = subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
@@ -316,11 +367,15 @@ def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expec
                 else _HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]
             ).from_pretrained(tmpdir, **model_kwargs)

-            ov_model = model.lm_model if task == "image-text-to-text" else model.model
+            submodels = []
+            if task == "text-generation-with-past":
+                submodels = [model]
+            elif task == "image-text-to-text":
+                submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
+                submodels += [getattr(model, part) for part in model.additional_parts]
+
+            compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)

-            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
-            expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
-            self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
             self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)
             self.assertTrue("--gptq" not in option or b"Applying GPTQ" in result.stdout)
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 5751bfdaae..ebc7725d43 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -76,7 +76,12 @@
 from optimum.intel.openvino.quantization import InferRequestWrapper
 from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version

-from utils_tests import MODEL_NAMES, get_num_quantized_nodes, _ARCHITECTURES_TO_EXPECTED_INT8
+from utils_tests import (
+    MODEL_NAMES,
+    get_num_quantized_nodes,
+    _ARCHITECTURES_TO_EXPECTED_INT8,
+    compare_num_quantized_nodes_per_model,
+)

 _TASK_TO_DATASET = {
     "text-generation": ("wikitext", "wikitext-2-raw-v1", "text"),
@@ -238,21 +243,26 @@ class OVWeightCompressionTest(unittest.TestCase):
             "gpt2",  # model name
             False,  # trust remote code
             dict(bits=4, sym=False, group_size=-1, ratio=0.8),  # quantization config
-            {"int4": 30, "int8": 14},  # reference number of low-precision nodes
+            [{"int8": 14, "int4": 30}],  # reference number of low-precision nodes
         ),
         (
             OVModelForCausalLM,
             "gpt2",
             False,
             dict(bits=4, weight_format="mxfp4", group_size=32),
-            {"f4e2m1": 20, "f8e8m0": 20, "int8": 4},
+            [{"int8": 4, "f4e2m1": 20, "f8e8m0": 20}],
         ),
         (
             OVModelForCausalLM,
             "gpt2",
             False,
             dict(bits=4, weight_format="nf4", group_size=32),
-            {"nf4": 20, "int8": 4},
+            [
+                {
+                    "int8": 4,
+                    "nf4": 20,
+                }
+            ],
         ),
         (
             OVModelForCausalLM,
             "gpt2",
             False,
             dict(
                 bits=4,
                 sym=False,
                 group_size=32,
                 ignored_scope={"names": ["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"]},
             ),
-            {"int4": 38, "int8": 4},
+            [{"int8": 4, "int4": 38}],
         ),
         (
             OVModelForCausalLM,
             "gpt2",
             False,
             dict(bits=4, sym=False, group_size=-1, ratio=0.8, all_layers=True),
-            {"int4": 26, "int8": 18},
+            [{"int8": 18, "int4": 26}],
         ),
         (
             OVModelForCausalLM,
             "gpt2",
             False,
             dict(
                 bits=4,
                 sym=True,
                 group_size=64,
                 ratio=0.8,
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="c4",
             ),
-            {"int4": 25, "int8": 14},
+            [{"int8": 14, "int4": 25}],
         ),
         (
             OVModelForCausalLM,
             "gpt2",
             False,
             dict(
                 bits=4,
                 sym=True,
                 group_size=64,
                 ratio=0.8,
                 sensitivity_metric="mean_activation_magnitude",
                 dataset=["one two, " * i for i in range(10)],
             ),
-            {"int4": 24, "int8": 16},
+            [{"int8": 16, "int4": 24}],
         ),
         (
             OVModelForCausalLM,
             "llama_awq",
             False,
             dict(
                 bits=4,
                 sym=True,
                 group_size=16,
                 ratio=0.8,
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="auto",
                 quant_method=QuantizationMethod.AWQ,
                 scale_estimation=True,
             ),
-            {"int4": 12, "int8": 8},
+            [{"int8": 8, "int4": 12}],
         ),
         (
             OVModelForCausalLM,
             "llama_awq",
             False,
             dict(
                 bits=4,
                 sym=True,
                 group_size=16,
                 ratio=0.8,
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="c4",
                 quant_method="awq",
             ),
-            {"int4": 12, "int8": 8},
+            [{"int8": 8, "int4": 12}],
         ),
         (
             OVModelForCausalLM,
             "llama_awq",
             False,
             dict(
                 bits=4,
                 sym=True,
                 group_size=16,
                 ratio=0.8,
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="c4",
                 gptq=True,
             ),
-            {"int4": 12, "int8": 8},
+            [{"int8": 8, "int4": 12}],
         ),
         (
             OVModelForCausalLM,
             "llama_awq",
             False,
             dict(
                 bits=4,
                 group_size=16,
                 num_samples=16,
                 dataset="auto",
                 lora_correction=True,
             ),
-            {"int4": 28, "int8": 60},
+            [{"int8": 60, "int4": 28}],
         ),
         (
             OVModelForCausalLM,
             "llama_awq",
             False,
             dict(bits=4, backup_precision="none", group_size=16),
-            {"int4": 28},
+            [{"int4": 28}],
         ),
+        (
+            OVModelForCausalLM,
+            "llama_awq",
+            False,
+            dict(bits=4, backup_precision="none", group_size=16, ratio=0.5),
+            [{"int4": 12}],
+        ),
+        (
+            OVModelForCausalLM,
+            "llama_awq",
+            False,
+            dict(bits=4, backup_precision="int8_sym", group_size=16, ratio=0.5),
+            [{"int4": 12, "int8": 10}],
+        ),
+        (
+            OVModelForCausalLM,
+            "llama_awq",
+            False,
+            dict(bits=4, backup_precision="int8_asym", group_size=16, ratio=0.5),
+            [{"int4": 12, "int8": 20}],
         ),
     ]

     if is_transformers_version(">=", "4.40.0"):
         LOAD_IN_4_BITS_SCOPE.extend(
             [
                 (
                     OVModelForVisualCausalLM,
                     "llava_next",
                     False,
                     dict(
                         bits=4,
                         group_size=16,
                         dataset="contextual",
                         ratio=0.2,
                         sensitivity_metric="mean_activation_magnitude",
                         num_samples=1,
                         processor=MODEL_NAMES["llava_next"],
                     ),
-                    {"int4": 24, "int8": 6},
+                    [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}],
                 ),
                 (
                     OVModelForVisualCausalLM,
                     "nanollava",
                     True,
                     dict(
                         bits=4,
                         group_size=8,
                         dataset="contextual",
                         ratio=0.8,
-                        sensitivity_metric="mean_activation_magnitude",
+                        sensitivity_metric="mean_activation_variance",
                         num_samples=1,
                         processor=MODEL_NAMES["nanollava_vision_tower"],
                         tokenizer=MODEL_NAMES["nanollava"],
                         trust_remote_code=True,
                     ),
-                    {"int4": 16, "int8": 14},
+                    [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
                 ),
             ]
         )

     if is_transformers_version(">=", "4.45.0"):
         LOAD_IN_4_BITS_SCOPE.extend(
             [
                 (
                     OVModelForVisualCausalLM,
                     "minicpmv",
                     True,
                     dict(
                         bits=4,
                         group_size=16,
                         dataset="contextual",
                         ratio=0.8,
                         sensitivity_metric="mean_activation_magnitude",
                         num_samples=1,
                         processor=MODEL_NAMES["minicpmv"],
                         trust_remote_code=True,
                     ),
-                    {"int4": 22, "int8": 8},
+                    [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
                 ),
                 (
                     OVModelForVisualCausalLM,
                     "internvl2",
                     True,
                     dict(
                         bits=4,
                         group_size=4,
                         dataset="contextual",
                         ratio=0.8,
                         sensitivity_metric="mean_activation_magnitude",
                         num_samples=1,
                         trust_remote_code=True,
                     ),
-                    {"int4": 22, "int8": 8},
+                    [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
                 ),
                 (
                     OVModelForVisualCausalLM,
                     "phi3_v",
                     True,
                     dict(
                         bits=4,
                         group_size=16,
                         dataset="contextual",
                         ratio=0.8,
                         sensitivity_metric="mean_activation_magnitude",
                         num_samples=1,
                         trust_remote_code=True,
                     ),
-                    {"int4": 14, "int8": 4},
+                    [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
                 ),
             ]
         )

@@ -721,7 +752,7 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_

     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_auto_compression_with_config(
-        self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes
+        self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes_per_model
     ):
         model_id = MODEL_NAMES[model_name]
         with TemporaryDirectory() as tmp_dir:
@@ -733,14 +764,17 @@ def test_ovmodel_4bit_auto_compression_with_config(
                 # TODO: Check that AWQ was actually applied
                 pass

-            ov_model = model.lm_model if model_cls == OVModelForVisualCausalLM else model.model
+            submodels = []
+            if isinstance(model, OVModelForCausalLM):
+                submodels = [model.model]
+            elif isinstance(model, OVModelForVisualCausalLM):
+                submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
+                submodels += [getattr(model, part) for part in model.additional_parts]
+            compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)

-            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
-            expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
-            self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             model.save_pretrained(tmp_dir)
-
-            wc_rt_info = ov_model.get_rt_info()["nncf"]["weight_compression"]
+            # At the moment the first model in the list is the only one we apply data-aware compression to
+            wc_rt_info = submodels[0].get_rt_info()["nncf"]["weight_compression"]
             self.assertEqual(quantization_config.quant_method.lower() == "awq", wc_rt_info["awq"].value == "True")
             self.assertEqual(
                 quantization_config.scale_estimation or False, wc_rt_info["scale_estimation"].value == "True"
             )
@@ -868,9 +902,9 @@ def main_export_not_in_stacktrace(*args, **kwargs):
         }
         compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params)

-    @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
+    @parameterized.expand(LOAD_IN_4_BITS_SCOPE[::5])
     def test_ovmodel_4bit_dynamic_with_config(
-        self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes
+        self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes_per_model
     ):
         model_id = MODEL_NAMES[model_name]
         with TemporaryDirectory() as tmp_dir:
@@ -884,13 +918,15 @@ def test_ovmodel_4bit_dynamic_with_config(
             self.assertEqual(model.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"], str(group_size))
             self.assertEqual(model.ov_config["KV_CACHE_PRECISION"], "u8")

-            ov_model = model.lm_model if model_cls == OVModelForVisualCausalLM else model.model
+            submodels = []
+            if isinstance(model, OVModelForCausalLM):
+                submodels = [model.model]
+            elif isinstance(model, OVModelForVisualCausalLM):
+                submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
+                submodels += [getattr(model, part) for part in model.additional_parts]
+            compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)

-            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
-            expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
-            self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             model.save_pretrained(tmp_dir)
-
             openvino_config = OVConfig.from_pretrained(tmp_dir)
             self.assertEqual(openvino_config.quantization_config.bits, 4)
             self.assertEqual(openvino_config.dtype, quantization_config.weight_format)
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index bf509a044f..6130ee06cd 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -11,11 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import unittest
+from typing import Dict, List, Union

 import numpy as np
 import openvino as ov
 import torch

+from optimum.intel.openvino.modeling_base import OVBaseModel
+

 MODEL_NAMES = {
     "albert": "hf-internal-testing/tiny-random-albert",
@@ -218,3 +222,17 @@ def get_num_quantized_nodes(model):
             if type_name == "nf4":
                 num_weight_nodes["nf4"] += 1
     return num_fake_quantize, num_weight_nodes
+
+
+def compare_num_quantized_nodes_per_model(
+    test_case: unittest.TestCase,
+    models: List[Union[ov.Model, OVBaseModel]],
+    expected_num_weight_nodes_per_model: List[Dict],
+):
+    test_case.assertEqual(len(models), len(expected_num_weight_nodes_per_model))
+    actual_num_weights_per_model = []
+    for submodel, expected_num_weight_nodes in zip(models, expected_num_weight_nodes_per_model):
+        _, num_weight_nodes = get_num_quantized_nodes(submodel)
+        expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
+        actual_num_weights_per_model.append(num_weight_nodes)
+    test_case.assertEqual(expected_num_weight_nodes_per_model, actual_num_weights_per_model)

From f3508f4a820ee99b956eef29488a8786166d985a Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 10 Dec 2024 15:16:02 +0100
Subject: [PATCH 3/6] Update references

---
 tests/openvino/test_exporters_cli.py | 2 +-
 tests/openvino/test_quantization.py  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 4fd2627b60..cf4cbd4f3e 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -148,7 +148,7 @@ class OVCLIExportTestCase(unittest.TestCase):
             "text-generation-with-past",
             "llama_awq",
             "int4 --group-size 16 --backup-precision none --ratio 0.5",
-            [{"int4": 12}],
+            [{"int4": 6}],
         ),
     ]

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index ebc7725d43..472c3bdbd6 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -382,21 +382,21 @@ class OVWeightCompressionTest(unittest.TestCase):
             "llama_awq",
             False,
             dict(bits=4, backup_precision="none", group_size=16, ratio=0.5),
-            [{"int4": 12}],
+            [{"int4": 6}],
         ),
         (
             OVModelForCausalLM,
             "llama_awq",
             False,
             dict(bits=4, backup_precision="int8_sym", group_size=16, ratio=0.5),
-            [{"int4": 12, "int8": 10}],
+            [{"int4": 6, "int8": 13}],
         ),
         (
             OVModelForCausalLM,
             "llama_awq",
             False,
             dict(bits=4, backup_precision="int8_asym", group_size=16, ratio=0.5),
-            [{"int4": 12, "int8": 20}],
+            [{"int4": 6, "int8": 26}],
         ),
     ]

From 3b0851ddec293ba9e659ce2082816e523619c247 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 12 Dec 2024 12:06:49 +0100
Subject: [PATCH 4/6] Fix condition

---
 optimum/commands/export/openvino.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 54570d3bfe..1c01e74646 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -361,7 +361,11 @@ def run(self):
             model.save_pretrained(self.args.output)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif task.startswith("text-generation") and quantize_with_dataset or task == "image-text-to-text":
+        elif (
+            task.startswith("text-generation")
+            and quantize_with_dataset
+            or (task == "image-text-to-text" and quantization_config is not None)
+        ):
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM

From 2f3ca06f6f0ca1c1154b19601c73fed98bc00f9d Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 12 Dec 2024 16:01:53 +0100
Subject: [PATCH 5/6] Export in auto dtype if possible

---
 optimum/intel/openvino/modeling_visual_language.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 8d6edea0f0..a7c2210082 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -598,7 +598,8 @@ def _from_transformers(
         if load_in_8bit is None and not quantization_config:
             ov_config = None
         else:
-            ov_config = OVConfig(dtype="fp32")
+            # Export in fp32 if compression won't be applied later
+            ov_config = OVConfig(dtype="fp32" if load_in_8bit is False else "auto")

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)

From 422ffe0d2da6c4fe48f91a8c150cb4ecf2b0b5d2 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 12 Dec 2024 16:07:45 +0100
Subject: [PATCH 6/6] Reformat condition

---
 optimum/commands/export/openvino.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 1c01e74646..db8e68f9e2 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -361,10 +361,8 @@ def run(self):
             model.save_pretrained(self.args.output)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif (
-            task.startswith("text-generation")
-            and quantize_with_dataset
-            or (task == "image-text-to-text" and quantization_config is not None)
-        ):
+        elif (task.startswith("text-generation") and quantize_with_dataset) or (
+            task == "image-text-to-text" and quantization_config is not None
+        ):
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM
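
Illustrative sketch (added for context, not part of the series): the CLI flow fixed by patches 1, 4 and 6 — `optimum-cli export openvino --task image-text-to-text --weight-format int4 ...` — reduces to the `from_pretrained(..., export=True, quantization_config=...)` call below, and patch 2 then checks the weight precisions of every submodel instead of only the language model. The checkpoint ID is a placeholder, and `get_num_quantized_nodes` is assumed to be importable from `tests/openvino/utils_tests.py`.

    from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig
    from utils_tests import get_num_quantized_nodes  # tests/openvino/utils_tests.py

    # Export a VLM to OpenVINO with data-free 4-bit weight compression.
    model = OVModelForVisualCausalLM.from_pretrained(
        "katuni4ka/tiny-random-llava-next",  # placeholder checkpoint
        export=True,
        quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16, ratio=0.8),
    )

    # Same submodel list the updated tests iterate over: language model first,
    # then vision/text embeddings plus any model-specific additional parts.
    submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
    submodels += [getattr(model, part) for part in model.additional_parts]

    for submodel in submodels:
        _, num_weight_nodes = get_num_quantized_nodes(submodel)
        print(num_weight_nodes)  # e.g. {"int8": 14, "int4": 16} for the language model

This is also why each reference list above pairs one mixed int4/int8 dict with several int8-only dicts: 4-bit (and any data-aware) compression targets the language model, while the remaining submodels are compressed to 8-bit.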