From f9702721661c22ac60ea017e66657ceadbe5f1bf Mon Sep 17 00:00:00 2001
From: "Cheng, Penghui"
Date: Sun, 24 Mar 2024 22:39:34 +0800
Subject: [PATCH] Load weight-only quantized model with INCModelForCausalLM

Signed-off-by: Cheng, Penghui
---
 .../language-modeling/run_clm.py              |  6 +---
 optimum/intel/neural_compressor/__init__.py   |  4 ---
 .../intel/neural_compressor/modeling_base.py  | 31 ++++++++++++++++---
 tests/neural_compressor/test_optimization.py  |  3 +-
 4 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/examples/neural_compressor/language-modeling/run_clm.py b/examples/neural_compressor/language-modeling/run_clm.py
index 746b7261b5..f169a621e0 100644
--- a/examples/neural_compressor/language-modeling/run_clm.py
+++ b/examples/neural_compressor/language-modeling/run_clm.py
@@ -63,7 +63,6 @@
 
 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
-    from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
 
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -777,10 +776,7 @@ def compute_metrics(eval_preds):
         trainer.model = quantizer._quantized_model
 
     if optim_args.apply_quantization and optim_args.verify_loading:
-        if optim_args.quantization_approach == "weight_only":
-            loaded_model = ITREXAutoModelForCausalLM.from_pretrained(training_args.output_dir)
-        else:
-            loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
+        loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
         tokens = tokenizer("This is a sample input", return_tensors="pt")
         with torch.no_grad():
             original_model_outputs = trainer.model(**tokens)
diff --git a/optimum/intel/neural_compressor/__init__.py b/optimum/intel/neural_compressor/__init__.py
index f3a7bffe69..2daecfbc93 100644
--- a/optimum/intel/neural_compressor/__init__.py
+++ b/optimum/intel/neural_compressor/__init__.py
@@ -32,7 +32,3 @@
 
 if is_diffusers_available():
     from .modeling_diffusion import INCStableDiffusionPipeline
-
-
-if is_intel_extension_for_transformers_available():
-    from .modeling_base import ITREXAutoModelForCausalLM
diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py
index 0226855d64..01d071bdd2 100644
--- a/optimum/intel/neural_compressor/modeling_base.py
+++ b/optimum/intel/neural_compressor/modeling_base.py
@@ -65,11 +65,7 @@
 
 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM as ITREX_WOQ_MODEL
-
-    class ITREXAutoModelForCausalLM(ITREX_WOQ_MODEL):
-        auto_model_class = AutoModelForCausalLM
-        export_feature = "text-generation"
-
+    from intel_extension_for_transformers.transformers.utils import WeightOnlyQuantConfig
 
 class INCModel(OptimizedModel):
     auto_model_class = AutoModel
@@ -138,6 +134,31 @@ def _from_pretrained(
             model_save_dir = Path(model_cache_path).parent
         inc_config = None
         msg = None
+        try:
+            quantization_config = WeightOnlyQuantConfig.from_pretrained(model_id)
+            if getattr(quantization_config, "algorithm", None) is not None and quantization_config.algorithm.lower() in [
+                "rtn", "gptq", "awq", "autoround"
+            ]:
+                if not is_intel_extension_for_transformers_available():
+                    raise ImportError(
+                        "The intel-extension-for-transformers package was not found. "
+                        "Please install it with: pip install intel-extension-for-transformers peft"
+                    )
+                return ITREX_WOQ_MODEL.from_pretrained(
+                    pretrained_model_name_or_path=model_id,
+                    use_auth_token=use_auth_token,
+                    revision=revision,
+                    force_download=force_download,
+                    cache_dir=cache_dir,
+                    local_files_only=local_files_only,
+                    subfolder=subfolder,
+                    trust_remote_code=trust_remote_code,
+                    **kwargs,
+                )
+        except EnvironmentError:
+            msg = (
+                "The model is not quantized with weight-only quantization."
+            )
         try:
             inc_config = INCConfig.from_pretrained(model_id)
             if not is_torch_version("==", inc_config.torch_version):
diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index 026138553c..6e91cc26b7 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -65,7 +65,6 @@
 from optimum.pipelines import ORT_SUPPORTED_TASKS
 
 if is_intel_extension_for_transformers_available():
-    from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
     from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -244,7 +243,7 @@ def test_weight_only_quantization(self, no_config, algo, weight_dtype):
             weight_only=True,  # use RTN quantization method and NF4 weight data type is default.
             save_directory=tmp_dir,
         )
-        q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
+        q_model = INCModelForCausalLM.from_pretrained(tmp_dir)
         inp = torch.tensor([calibration_dataset[0]["input_ids"]])
         out = model(inp)[0]
         q_out = q_model(inp)[0]
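
After this change, a weight-only quantized checkpoint loads through the same
INCModelForCausalLM entry point as any other INC model. A minimal sketch of
that flow follows; the checkpoint directory is a placeholder, and it assumes a
model exported with weight-only quantization (e.g. by run_clm.py above) and
that intel-extension-for-transformers and peft are installed:

    import torch
    from transformers import AutoTokenizer

    from optimum.intel.neural_compressor import INCModelForCausalLM

    # Hypothetical output directory of a weight-only quantization run, e.g.
    # run_clm.py with apply_quantization and quantization_approach="weight_only".
    quantized_model_dir = "path/to/woq_model"

    # from_pretrained() first probes for a WeightOnlyQuantConfig; checkpoints
    # quantized with rtn/gptq/awq/autoround are loaded through
    # intel-extension-for-transformers, anything else falls back to the
    # regular INC loading path.
    model = INCModelForCausalLM.from_pretrained(quantized_model_dir)

    # Assumes the tokenizer was saved alongside the quantized model.
    tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
    tokens = tokenizer("This is a sample input", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)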