Skip to content

Commit

Permalink
Changed the logic of default 8-bit weights compression (#445)
Browse files Browse the repository at this point in the history
* Added 8bit compression for decoders larger than 1B

* Style

* Fixed issue

* Fixed one more issue

* Added warning for nncf absence in case of default compression to 8 bits

* Fixed an issue. Added warning message when NNCF is not available

* Revised logic of the default INT8 export

* Added tests for auto weights compression

* Updated references
  • Loading branch information
AlexKoff88 authored Oct 4, 2023
1 parent d207110 commit d761009
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 18 deletions.
18 changes: 8 additions & 10 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,19 +235,17 @@ def main_export(
onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task)
onnx_config = onnx_config_constructor(model.config)
models_and_onnx_configs = {"model": (model, onnx_config)}
if model_kwargs is None:
model_kwargs = {}
model_kwargs = model_kwargs or {}
load_in_8bit = model_kwargs.get("load_in_8bit", None)
if load_in_8bit is None:
if model.num_parameters() >= _MAX_UNCOMPRESSED_DECODER_SIZE:
model_kwargs["load_in_8bit"] = True
else:
model_kwargs["load_in_8bit"] = False
else:
if not is_nncf_available():
raise ImportError(
"Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`"
)
if not is_nncf_available():
logger.warning(
"The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf."
"please install it with `pip install nncf`"
)
else:
model_kwargs["load_in_8bit"] = True

if not is_stable_diffusion:
needs_pad_token_id = (
Expand Down
6 changes: 3 additions & 3 deletions optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@
def _save_model(model, path: str, compress_to_fp16=False, load_in_8bit=False):
if load_in_8bit:
if not is_nncf_available():
logger.warning(
"The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf."
"please install it with `pip install nncf`"
raise ImportError(
"Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`"
)

import nncf

model = nncf.compress_weights(model)
Expand Down
14 changes: 14 additions & 0 deletions tests/openvino/test_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ class OVWeightCompressionTest(unittest.TestCase):
(OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 45, 22),
)

UPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = ((OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 22),)

@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_COMPRESSED_MATMULS)
def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
task = model_cls.export_feature
Expand Down Expand Up @@ -197,6 +199,18 @@ def test_ovmodel_weight_compression(self, model_cls, model_name, expected_pt_int
outputs = model(**tokens)
self.assertTrue("logits" in outputs)

@parameterized.expand(UPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
def test_ovmodel_load_with_compressed_weights(self, model_cls, model_name, expected_ov_int8):
model = model_cls.from_pretrained(model_name, export=True, load_in_8bit=True)
_, num_int8 = get_num_quantized_nodes(model)
self.assertEqual(expected_ov_int8, num_int8)

@parameterized.expand(UPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_name, expected_ov_int8):
model = model_cls.from_pretrained(model_name, export=True, load_in_8bit=False)
_, num_int8 = get_num_quantized_nodes(model)
self.assertEqual(0, num_int8)


class OVQuantizerQATest(unittest.TestCase):
SUPPORTED_ARCHITECTURES = (("hf-internal-testing/tiny-random-BertForQuestionAnswering",),)
Expand Down
10 changes: 5 additions & 5 deletions tests/openvino/test_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2],
expected_fake_quantize=48,
expected_int8=31,
expected_int8=30,
compression_metrics=["compression_loss"],
),
"structured_movement_sparsity": OVTrainerTestDescriptor(
Expand All @@ -734,15 +734,15 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
expected_fake_quantize=48,
expected_int8=31,
expected_int8=30,
expected_binary_masks=48,
compression_metrics=["compression_loss"],
),
"quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
expected_fake_quantize=48,
expected_int8=31,
expected_int8=30,
expected_binary_masks=48,
compression_metrics=["compression_loss"],
),
Expand All @@ -751,7 +751,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
teacher_model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
expected_fake_quantize=48,
expected_int8=31,
expected_int8=30,
expected_binary_masks=48,
compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
),
Expand All @@ -760,7 +760,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
teacher_model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
expected_fake_quantize=48,
expected_int8=31,
expected_int8=30,
expected_binary_masks=48,
compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
),
Expand Down

0 comments on commit d761009

Please sign in to comment.