
fix load_in_int8 behaviour for large models
eaidova committed Dec 25, 2023
1 parent 173aacd commit 3bc3d01
Showing 5 changed files with 54 additions and 8 deletions.
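
The fix is applied identically in the four modeling classes below: load_in_8bit changes from a plain bool defaulting to False to an Optional[bool] defaulting to None, and is translated into the exporter's compression_option argument, so that an explicit False can no longer be confused with "not specified". A minimal sketch of that mapping (the helper name is hypothetical; each _from_transformers inlines the same three lines):

    from typing import Optional

    def _to_compression_option(load_in_8bit: Optional[bool]) -> Optional[str]:
        # None  -> no explicit choice; main_export falls back to its own default
        # True  -> force 8-bit weight compression
        # False -> explicitly keep full-precision (fp32) weights
        if load_in_8bit is None:
            return None
        return "int8" if load_in_8bit else "fp32"
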
8 changes: 6 additions & 2 deletions optimum/intel/openvino/modeling_base.py
@@ -259,7 +259,7 @@ def _from_transformers(
local_files_only: bool = False,
task: Optional[str] = None,
trust_remote_code: bool = False,
- load_in_8bit: bool = False,
+ load_in_8bit: Optional[bool] = None,
**kwargs,
):
"""
@@ -283,6 +283,10 @@ def _from_transformers(
save_dir = TemporaryDirectory()
save_dir_path = Path(save_dir.name)

+ compression_option = None
+ if load_in_8bit is not None:
+     compression_option = "int8" if load_in_8bit else "fp32"
+
main_export(
model_name_or_path=model_id,
output=save_dir_path,
@@ -294,7 +298,7 @@ def _from_transformers(
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
- int8=load_in_8bit,
+ compression_option=compression_option,
)

config.save_pretrained(save_dir_path)
7 changes: 5 additions & 2 deletions optimum/intel/openvino/modeling_base_seq2seq.py
@@ -220,7 +220,7 @@ def _from_transformers(
task: Optional[str] = None,
use_cache: bool = True,
trust_remote_code: bool = False,
- load_in_8bit: bool = False,
+ load_in_8bit: Optional[bool] = None,
**kwargs,
):
"""
@@ -251,6 +251,9 @@ def _from_transformers(
if use_cache:
task = task + "-with-past"

+ compression_option = None
+ if load_in_8bit is not None:
+     compression_option = "int8" if load_in_8bit else "fp32"
main_export(
model_name_or_path=model_id,
output=save_dir_path,
@@ -262,7 +265,7 @@ def _from_transformers(
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
compression_option="int8" if load_in_8bit else None,
compression_option=compression_option,
)

config.save_pretrained(save_dir_path)
7 changes: 5 additions & 2 deletions optimum/intel/openvino/modeling_decoder.py
@@ -211,7 +211,7 @@ def _from_transformers(
task: Optional[str] = None,
use_cache: bool = True,
trust_remote_code: bool = False,
- load_in_8bit: bool = False,
+ load_in_8bit: Optional[bool] = None,
**kwargs,
):
if config.model_type.replace("_", "-") not in _SUPPORTED_ARCHITECTURES:
@@ -228,6 +228,9 @@
if use_cache:
task = task + "-with-past"

+ compression_option = None
+ if load_in_8bit is not None:
+     compression_option = "int8" if load_in_8bit else "fp32"
main_export(
model_name_or_path=model_id,
output=save_dir_path,
@@ -239,7 +242,7 @@
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
compression_option="int8" if load_in_8bit else None,
compression_option=compression_option,
)

config.is_decoder = True
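
From the caller's side, the decoder change means the exporter only receives an explicit compression choice when the user actually passes load_in_8bit. A usage sketch under that assumption (the model id is illustrative; the large-model int8 default is applied inside the export path, not in this class):

    from optimum.intel import OVModelForCausalLM

    # load_in_8bit left unset: compression_option stays None and the exporter
    # applies its own default (the new tests expect int8 for very large models).
    model = OVModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", export=True)

    # Explicit opt-out: load_in_8bit=False is now forwarded as compression_option="fp32"
    # instead of being treated the same as the unset case.
    model_fp32 = OVModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf", export=True, load_in_8bit=False
    )
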
8 changes: 6 additions & 2 deletions optimum/intel/openvino/modeling_diffusion.py
@@ -286,13 +286,17 @@ def _from_transformers(
tokenizer: Optional["CLIPTokenizer"] = None,
scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"] = None,
feature_extractor: Optional["CLIPFeatureExtractor"] = None,
- load_in_8bit: bool = False,
+ load_in_8bit: Optional[bool] = None,
tokenizer_2: Optional["CLIPTokenizer"] = None,
**kwargs,
):
save_dir = TemporaryDirectory()
save_dir_path = Path(save_dir.name)

+ compression_option = None
+ if load_in_8bit is not None:
+     compression_option = "int8" if load_in_8bit else "fp32"
+
main_export(
model_name_or_path=model_id,
output=save_dir_path,
@@ -304,7 +308,7 @@
use_auth_token=use_auth_token,
local_files_only=local_files_only,
force_download=force_download,
- int8=load_in_8bit,
+ compression_option=compression_option,
)

return cls._from_pretrained(
32 changes: 32 additions & 0 deletions tests/openvino/test_quantization.py
@@ -272,6 +272,38 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type):
_, num_int8, _ = get_num_quantized_nodes(model)
self.assertEqual(0, num_int8)

+    def test_ovmodel_load_large_model_with_default_compressed_weights(self):
+        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
+            model_mixin_patch.num_parameters.return_value = 2e9
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    _ = OVModelForCausalLM.from_pretrained(
+                        MODEL_NAMES["llama"], export=True, compile=False, use_cache=False
+                    )
+                    saving_params = {
+                        "model": unittest.mock.ANY,
+                        "path": unittest.mock.ANY,
+                        "compression_option": "int8",
+                        "compression_ratio": None,
+                    }
+                    save_model_patch.assert_called_with(saving_params)
+
+    def test_ovmodel_load_large_model_with_uncompressed_weights(self):
+        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
+            model_mixin_patch.num_parameters.return_value = 2e9
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    _ = OVModelForCausalLM.from_pretrained(
+                        MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False
+                    )
+                    saving_params = {
+                        "model": unittest.mock.ANY,
+                        "path": unittest.mock.ANY,
+                        "compression_option": "fp32",
+                        "compression_ratio": None,
+                    }
+                    save_model_patch.assert_called_with(saving_params)
+

class OVQuantizerQATest(unittest.TestCase):
SUPPORTED_ARCHITECTURES = (("hf-internal-testing/tiny-random-BertForQuestionAnswering",),)
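
For context on the new tests: they patch ModuleUtilsMixin so the tiny llama checkpoint reports about 2e9 parameters, stub out Core.read_model so no real OpenVINO model has to be loaded, and intercept optimum.exporters.openvino.convert._save_model to inspect which compression_option the export forwarded. A rough sketch of the behaviour the tests pin down; the threshold name and the 1e9 cutoff are assumptions for illustration, not values taken from this commit:

    from typing import Optional

    _LARGE_MODEL_THRESHOLD = int(1e9)  # assumed parameter-count cutoff

    def expected_compression_option(num_parameters: int, load_in_8bit: Optional[bool] = None) -> Optional[str]:
        # An explicit user choice always wins.
        if load_in_8bit is not None:
            return "int8" if load_in_8bit else "fp32"
        # With no explicit choice, very large models default to 8-bit weights.
        return "int8" if num_parameters > _LARGE_MODEL_THRESHOLD else None

    assert expected_compression_option(int(2e9)) == "int8"                      # default-compressed case
    assert expected_compression_option(int(2e9), load_in_8bit=False) == "fp32"  # uncompressed case
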
