From 3bc3d010cc30ca757e94a8cb791df9a42592014b Mon Sep 17 00:00:00 2001
From: eaidova
Date: Mon, 25 Dec 2023 16:40:59 +0400
Subject: [PATCH] fix load_in_8bit behaviour for large models

---
 optimum/intel/openvino/modeling_base.py       |  8 +++--
 .../intel/openvino/modeling_base_seq2seq.py   |  7 ++--
 optimum/intel/openvino/modeling_decoder.py    |  7 ++--
 optimum/intel/openvino/modeling_diffusion.py  |  8 +++--
 tests/openvino/test_quantization.py           | 32 +++++++++++++++++++
 5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 67e8d20502..54eb2f44db 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -259,7 +259,7 @@ def _from_transformers(
         local_files_only: bool = False,
         task: Optional[str] = None,
         trust_remote_code: bool = False,
-        load_in_8bit: bool = False,
+        load_in_8bit: Optional[bool] = None,
         **kwargs,
     ):
         """
@@ -283,6 +283,10 @@
         save_dir = TemporaryDirectory()
         save_dir_path = Path(save_dir.name)
 
+        compression_option = None
+        if load_in_8bit is not None:
+            compression_option = "int8" if load_in_8bit else "fp32"
+
         main_export(
             model_name_or_path=model_id,
             output=save_dir_path,
@@ -294,7 +298,7 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            int8=load_in_8bit,
+            compression_option=compression_option,
         )
 
         config.save_pretrained(save_dir_path)
diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py
index 3471c6f954..485db25bc5 100644
--- a/optimum/intel/openvino/modeling_base_seq2seq.py
+++ b/optimum/intel/openvino/modeling_base_seq2seq.py
@@ -220,7 +220,7 @@ def _from_transformers(
         task: Optional[str] = None,
         use_cache: bool = True,
         trust_remote_code: bool = False,
-        load_in_8bit: bool = False,
+        load_in_8bit: Optional[bool] = None,
         **kwargs,
     ):
         """
@@ -251,6 +251,9 @@
         if use_cache:
             task = task + "-with-past"
 
+        compression_option = None
+        if load_in_8bit is not None:
+            compression_option = "int8" if load_in_8bit else "fp32"
         main_export(
             model_name_or_path=model_id,
             output=save_dir_path,
@@ -262,7 +265,7 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option="int8" if load_in_8bit else None,
+            compression_option=compression_option,
         )
 
         config.save_pretrained(save_dir_path)
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 8147cc74e8..73794c8330 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -211,7 +211,7 @@ def _from_transformers(
         task: Optional[str] = None,
         use_cache: bool = True,
         trust_remote_code: bool = False,
-        load_in_8bit: bool = False,
+        load_in_8bit: Optional[bool] = None,
         **kwargs,
     ):
         if config.model_type.replace("_", "-") not in _SUPPORTED_ARCHITECTURES:
@@ -228,6 +228,9 @@
         if use_cache:
             task = task + "-with-past"
 
+        compression_option = None
+        if load_in_8bit is not None:
+            compression_option = "int8" if load_in_8bit else "fp32"
         main_export(
             model_name_or_path=model_id,
             output=save_dir_path,
@@ -239,7 +242,7 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option="int8" if load_in_8bit else None,
+            compression_option=compression_option,
         )
 
         config.is_decoder = True
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 4443381cd6..f51b60f63b 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -286,13 +286,17 @@ def _from_transformers(
         tokenizer: Optional["CLIPTokenizer"] = None,
         scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"] = None,
         feature_extractor: Optional["CLIPFeatureExtractor"] = None,
-        load_in_8bit: bool = False,
+        load_in_8bit: Optional[bool] = None,
         tokenizer_2: Optional["CLIPTokenizer"] = None,
         **kwargs,
     ):
         save_dir = TemporaryDirectory()
         save_dir_path = Path(save_dir.name)
 
+        compression_option = None
+        if load_in_8bit is not None:
+            compression_option = "int8" if load_in_8bit else "fp32"
+
         main_export(
             model_name_or_path=model_id,
             output=save_dir_path,
@@ -304,7 +308,7 @@
             use_auth_token=use_auth_token,
             local_files_only=local_files_only,
             force_download=force_download,
-            int8=load_in_8bit,
+            compression_option=compression_option,
         )
 
         return cls._from_pretrained(
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index c3378c08e6..a08da51aab 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -272,6 +272,38 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type):
         _, num_int8, _ = get_num_quantized_nodes(model)
         self.assertEqual(0, num_int8)
 
+    def test_ovmodel_load_large_model_with_default_compressed_weights(self):
+        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
+            model_mixin_patch.num_parameters.return_value = 2e9
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    _ = OVModelForCausalLM.from_pretrained(
+                        MODEL_NAMES["llama"], export=True, compile=False, use_cache=False
+                    )
+                    saving_params = {
+                        "model": unittest.mock.ANY,
+                        "path": unittest.mock.ANY,
+                        "compression_option": "int8",
+                        "compression_ratio": None,
+                    }
+                    save_model_patch.assert_called_with(**saving_params)
+
+    def test_ovmodel_load_large_model_with_uncompressed_weights(self):
+        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
+            model_mixin_patch.num_parameters.return_value = 2e9
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    _ = OVModelForCausalLM.from_pretrained(
+                        MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False
+                    )
+                    saving_params = {
+                        "model": unittest.mock.ANY,
+                        "path": unittest.mock.ANY,
+                        "compression_option": "fp32",
+                        "compression_ratio": None,
+                    }
+                    save_model_patch.assert_called_with(**saving_params)
+
 
 class OVQuantizerQATest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = (("hf-internal-testing/tiny-random-BertForQuestionAnswering",),)
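
Usage sketch (illustrative, not part of the patch): each `_from_transformers` override above maps the now tri-state `load_in_8bit` flag onto the `compression_option` argument of `main_export`. `None` leaves the choice to the exporter (which, per the tests above, defaults to "int8" for a ~2e9-parameter model), while an explicit `True`/`False` forces "int8"/"fp32". A minimal sketch of that mapping; the helper name is hypothetical:

    from typing import Optional

    def _to_compression_option(load_in_8bit: Optional[bool]) -> Optional[str]:
        # None  -> None: defer to main_export's size-based default
        # True  -> "int8": force 8-bit weight compression
        # False -> "fp32": explicitly keep full-precision weights
        if load_in_8bit is None:
            return None
        return "int8" if load_in_8bit else "fp32"

    assert _to_compression_option(None) is None
    assert _to_compression_option(True) == "int8"
    assert _to_compression_option(False) == "fp32"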