
fix load_in_int8 behaviour for large models
eaidova committed Dec 25, 2023
1 parent 173aacd commit 3bc3d01
Showing 5 changed files with 54 additions and 8 deletions.
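
The fix is applied identically in the four modeling classes below: load_in_8bit changes from a plain bool defaulting to False to an Optional[bool] defaulting to None, and is translated into the exporter's compression_option argument, so that an explicit False can no longer be confused with "not specified". A minimal sketch of that mapping (the helper name is hypothetical; each _from_transformers inlines the same three lines):

    from typing import Optional

    def _to_compression_option(load_in_8bit: Optional[bool]) -> Optional[str]:
        # None  -> no explicit choice; main_export falls back to its own default
        # True  -> force 8-bit weight compression
        # False -> explicitly keep full-precision (fp32) weights
        if load_in_8bit is None:
            return None
        return "int8" if load_in_8bit else "fp32"
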
8 changes: 6 additions & 2 deletions optimum/intel/openvino/modeling_base.py
@@ -259,7 +259,7 @@ def _from_transformers(
local_files_only: bool = False,
task: Optional[str] = None,
trust_remote_code: bool = False,
- load_in_8bit: bool = False,
+ load_in_8bit: Optional[bool] = None,
**kwargs,
):
"""
@@ -283,6 +283,10 @@ def _from_transformers(
save_dir = TemporaryDirectory()
save_dir_path = Path(save_dir.name)

+ compression_option = None
+ if load_in_8bit is not None:
+     compression_option = "int8" if load_in_8bit else "fp32"
+
main_export(
model_name_or_path=model_id,
output=save_dir_path,
@@ -294,7 +298,7 @@ def _from_transformers(
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
- int8=load_in_8bit,
+ compression_option=compression_option,
)

config.save_pretrained(save_dir_path)
7 changes: 5 additions & 2 deletions optimum/intel/openvino/modeling_base_seq2seq.py
@@ -220,7 +220,7 @@ def _from_transformers(
task: Optional[str] = None,
use_cache: bool = True,
trust_remote_code: bool = False,
- load_in_8bit: bool = False,
+ load_in_8bit: Optional[bool] = None,
**kwargs,
):
"""
@@ -251,6 +251,9 @@ def _from_transformers(
if use_cache:
task = task + "-with-past"

+ compression_option = None
+ if load_in_8bit is not None:
+     compression_option = "int8" if load_in_8bit else "fp32"
main_export(
model_name_or_path=model_id,
output=save_dir_path,
@@ -262,7 +265,7 @@ def _from_transformers(
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
compression_option="int8" if load_in_8bit else None,
compression_option=compression_option,
)

config.save_pretrained(save_dir_path)
7 changes: 5 additions & 2 deletions optimum/intel/openvino/modeling_decoder.py
@@ -211,7 +211,7 @@ def _from_transformers(
task: Optional[str] = None,
use_cache: bool = True,
trust_remote_code: bool = False,
- load_in_8bit: bool = False,
+ load_in_8bit: Optional[bool] = None,
**kwargs,
):
if config.model_type.replace("_", "-") not in _SUPPORTED_ARCHITECTURES:
@@ -228,6 +228,9 @@
if use_cache:
task = task + "-with-past"

+ compression_option = None
+ if load_in_8bit is not None:
+     compression_option = "int8" if load_in_8bit else "fp32"
main_export(
model_name_or_path=model_id,
output=save_dir_path,
@@ -239,7 +242,7 @@
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
compression_option="int8" if load_in_8bit else None,
compression_option=compression_option,
)

config.is_decoder = True
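
From the caller's side, the decoder change means the exporter only receives an explicit compression choice when the user actually passes load_in_8bit. A usage sketch under that assumption (the model id is illustrative; the large-model int8 default is applied inside the export path, not in this class):

    from optimum.intel import OVModelForCausalLM

    # load_in_8bit left unset: compression_option stays None and the exporter
    # applies its own default (the new tests expect int8 for very large models).
    model = OVModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", export=True)

    # Explicit opt-out: load_in_8bit=False is now forwarded as compression_option="fp32"
    # instead of being treated the same as the unset case.
    model_fp32 = OVModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf", export=True, load_in_8bit=False
    )
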
8 changes: 6 additions & 2 deletions optimum/intel/openvino/modeling_diffusion.py
@@ -286,13 +286,17 @@ def _from_transformers(
tokenizer: Optional["CLIPTokenizer"] = None,
scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"] = None,
feature_extractor: Optional["CLIPFeatureExtractor"] = None,
- load_in_8bit: bool = False,
+ load_in_8bit: Optional[bool] = None,
tokenizer_2: Optional["CLIPTokenizer"] = None,
**kwargs,
):
save_dir = TemporaryDirectory()
save_dir_path = Path(save_dir.name)

+ compression_option = None
+ if load_in_8bit is not None:
+     compression_option = "int8" if load_in_8bit else "fp32"
+
main_export(
model_name_or_path=model_id,
output=save_dir_path,
@@ -304,7 +308,7 @@
use_auth_token=use_auth_token,
local_files_only=local_files_only,
force_download=force_download,
- int8=load_in_8bit,
+ compression_option=compression_option,
)

return cls._from_pretrained(
32 changes: 32 additions & 0 deletions tests/openvino/test_quantization.py
@@ -272,6 +272,38 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type):
_, num_int8, _ = get_num_quantized_nodes(model)
self.assertEqual(0, num_int8)

+    def test_ovmodel_load_large_model_with_default_compressed_weights(self):
+        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
+            model_mixin_patch.num_parameters.return_value = 2e9
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    _ = OVModelForCausalLM.from_pretrained(
+                        MODEL_NAMES["llama"], export=True, compile=False, use_cache=False
+                    )
+                    saving_params = {
+                        "model": unittest.mock.ANY,
+                        "path": unittest.mock.ANY,
+                        "compression_option": "int8",
+                        "compression_ratio": None,
+                    }
+                    save_model_patch.assert_called_with(saving_params)
+
+    def test_ovmodel_load_large_model_with_uncompressed_weights(self):
+        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
+            model_mixin_patch.num_parameters.return_value = 2e9
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    _ = OVModelForCausalLM.from_pretrained(
+                        MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False
+                    )
+                    saving_params = {
+                        "model": unittest.mock.ANY,
+                        "path": unittest.mock.ANY,
+                        "compression_option": "fp32",
+                        "compression_ratio": None,
+                    }
+                    save_model_patch.assert_called_with(saving_params)
+

class OVQuantizerQATest(unittest.TestCase):
SUPPORTED_ARCHITECTURES = (("hf-internal-testing/tiny-random-BertForQuestionAnswering",),)
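
For context on the new tests: they patch ModuleUtilsMixin so the tiny llama checkpoint reports about 2e9 parameters, stub out Core.read_model so no real OpenVINO model has to be loaded, and intercept optimum.exporters.openvino.convert._save_model to inspect which compression_option the export forwarded. A rough sketch of the behaviour the tests pin down; the threshold name and the 1e9 cutoff are assumptions for illustration, not values taken from this commit:

    from typing import Optional

    _LARGE_MODEL_THRESHOLD = int(1e9)  # assumed parameter-count cutoff

    def expected_compression_option(num_parameters: int, load_in_8bit: Optional[bool] = None) -> Optional[str]:
        # An explicit user choice always wins.
        if load_in_8bit is not None:
            return "int8" if load_in_8bit else "fp32"
        # With no explicit choice, very large models default to 8-bit weights.
        return "int8" if num_parameters > _LARGE_MODEL_THRESHOLD else None

    assert expected_compression_option(int(2e9)) == "int8"                      # default-compressed case
    assert expected_compression_option(int(2e9), load_in_8bit=False) == "fp32"  # uncompressed case
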
