diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 383e1cc04..5e9af5d26 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -167,6 +167,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "applying GPTQ takes additional memory and time."
         ),
     )
+    optional_group.add_argument(
+        "--lora",
+        action="store_true",
+        default=None,
+        help=(
+            "Indicates whether to apply the LoRA Correction algorithm. When enabled, this algorithm mitigates "
+            "quantization noise introduced during weight compression by leveraging low-rank adaptation. Please note, "
+            "that applying the LoRA algorithm takes additional memory and time."
+        ),
+    )
     optional_group.add_argument(
         "--sensitivity-metric",
         type=str,
@@ -215,6 +225,7 @@ def no_compression_parameter_provided(args):
                 args.awq,
                 args.scale_estimation,
                 args.gptq,
+                args.lora,
                 args.sensitivity_metric,
             )
         )
@@ -287,6 +298,7 @@ def run(self):
                 "sensitivity_metric": self.args.sensitivity_metric,
                 "scale_estimation": self.args.scale_estimation,
                 "gptq": self.args.gptq,
+                "lora": self.args.lora,
                 "weight_format": self.args.weight_format,
             }
 
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 1dba6c32f..c91a1d847 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -356,6 +356,11 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
                 - A string, the *model id* of a predefined processor hosted inside a model repo on huggingface.co.
                 - A path to a *directory* containing files required by the processor, for instance saved
                     using the [`~AutoProcessor.save_pretrained`] method, e.g., `./my_model_directory/`.
+        lora (`bool`, *optional*):
+            If True, apply the LoRA Correction algorithm. When enabled, this algorithm mitigates quantization noise
+            introduced during weight compression by leveraging low-rank adaptation. It calculates low-rank matrices via
+            singular value decomposition (SVD) on the difference between the original and quantized weights. These
+            matrices are iteratively refined by solving a system of linear equations to improve accuracy.
""" def __init__( @@ -376,6 +381,7 @@ def __init__( weight_format: Optional[str] = None, gptq: bool = None, processor: Optional[str] = None, + lora: bool = None, **kwargs, ): super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples) @@ -391,6 +397,7 @@ def __init__( self.weight_format = weight_format self.gptq = gptq self.processor = processor + self.lora = lora self.post_init() def post_init(self): @@ -464,14 +471,17 @@ def post_init(self): raise ValueError( f"When applying weight compression with '{self.weight_format}' weight format, the `bits` parameter must be set to 4, but found {self.bits}" ) - if self.quant_method == OVQuantizationMethod.AWQ: - raise ValueError(f"The AWQ algorithm is not supported for '{self.weight_format}' weight format") - if self.scale_estimation: - raise ValueError( - f"The Scale Estimation algorithm is not supported for '{self.weight_format}' weight format" - ) - if self.weight_format == "mxfp4" and self.gptq: - raise ValueError("The GPTQ algorithm is not supported for 'mxfp4' weight format") + if self.weight_format == "mxfp4": + if self.quant_method == OVQuantizationMethod.AWQ: + raise ValueError("The AWQ algorithm is not supported for 'mxpf4' weight format") + if self.scale_estimation: + raise ValueError("The Scale Estimation algorithm is not supported for 'mxpf4' weight format") + if self.gptq: + raise ValueError("The GPTQ algorithm is not supported for 'mxfp4' weight format") + if self.lora: + raise ValueError("The LoRA algorithm is not supported for 'mxfp4' weight format") + if self.gptq and self.lora: + raise ValueError("The GPTQ and LoRA algorithms can't be applied simultaneously") @dataclass diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 1b36c98b4..9fb05fc12 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -453,20 +453,13 @@ def _quantize_ovbasemodel( if calibration_dataset is None: raise ValueError("Calibration dataset is required to run quantization.") - # TODO: remove after update to NNCF 2.14 - model_type = nncf.ModelType(quantization_config.model_type) - ignored_scope = quantization_config.get_ignored_scope_instance() - if model_type == nncf.ModelType.TRANSFORMER: - ignored_scope.types += ["GroupNormalization"] - ignored_scope.validate = False - # Actual model quantization quantized_model = nncf.quantize( self.model.model, calibration_dataset, subset_size=quantization_config.num_samples, - ignored_scope=ignored_scope, - model_type=model_type, + ignored_scope=quantization_config.get_ignored_scope_instance(), + model_type=nncf.ModelType(quantization_config.model_type), preset=nncf.QuantizationPreset.PERFORMANCE if quantization_config.sym else nncf.QuantizationPreset.MIXED, fast_bias_correction=quantization_config.fast_bias_correction, advanced_parameters=nncf.AdvancedQuantizationParameters( @@ -951,6 +944,7 @@ def _weight_only_quantization( subset_size=config.num_samples if config.num_samples else 128, scale_estimation=config.scale_estimation, gptq=config.gptq, + lora_correction=config.lora, ) @@ -1027,10 +1021,6 @@ def _hybrid_quantization( ptq_ignored_scope = quantization_config.get_ignored_scope_instance() ptq_ignored_scope.names += ops_to_compress - # TODO: remove after update to NNCF 2.14 - ptq_ignored_scope.types += ["GroupNormalization"] - ptq_ignored_scope.validate = False - subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 quantized_model = nncf.quantize( 
         model=compressed_model,
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 67511bb84..71504dd07 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -131,6 +131,12 @@ class OVCLIExportTestCase(unittest.TestCase):
             "int4 --ratio 1.0 --sym --group-size 16 --gptq --dataset wikitext2 --num-samples 100 ",
             {"int8": 4, "int4": 14},
         ),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --ratio 1.0 --sym --group-size 16 --lora --dataset wikitext2 --num-samples 1",
+            {"int8": 4, "int4": 14},
+        ),
     ]
 
     if is_transformers_version(">=", "4.40.0"):
@@ -317,6 +323,7 @@ def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expec
         self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
         self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)
         self.assertTrue("--gptq" not in option or b"Applying GPTQ" in result.stdout)
+        self.assertTrue("--lora" not in option or b"with correction of low-rank adapters" in result.stdout)
 
     def test_exporters_cli_int4_with_local_model_and_default_config(self):
         with TemporaryDirectory() as tmpdir:
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 2869acf83..22d0797ba 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -306,6 +306,18 @@ class OVWeightCompressionTest(unittest.TestCase):
             ),
             {"int4": 12, "int8": 8},
         ),
+        (
+            OVModelForCausalLM,
+            "llama_awq",
+            False,
+            dict(
+                bits=4,
+                num_samples=1,
+                dataset="c4",
+                lora=True,
+            ),
+            {"int4": 12, "int8": 8},
+        ),
     ]
 
     if is_transformers_version(">=", "4.40.0"):
@@ -685,6 +697,7 @@ def test_ovmodel_4bit_auto_compression_with_config(
             quantization_config.scale_estimation or False, wc_rt_info["scale_estimation"].value == "True"
         )
        self.assertEqual(quantization_config.gptq or False, wc_rt_info["gptq"].value == "True")
+        self.assertEqual(quantization_config.lora or False, wc_rt_info["lora_correction"].value == "True")
 
         openvino_config = OVConfig.from_pretrained(tmp_dir)
         self.assertEqual(openvino_config.quantization_config.bits, 4)
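
Minimal usage sketch of the new option (not part of the diff). It mirrors the configuration exercised in the new test entries above; the model id, dataset, sample count, and output path below are placeholder choices, and `lora=True` is simply forwarded to NNCF's `lora_correction` argument as shown in `_weight_only_quantization`:

# Sketch only: 4-bit weight compression with LoRA Correction enabled,
# following the dict(bits=4, dataset=..., lora=True) pattern used in
# tests/openvino/test_quantization.py.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# LoRA Correction is data-aware (like GPTQ and Scale Estimation), so a
# calibration dataset is provided alongside the compression settings.
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    dataset="wikitext2",   # placeholder calibration dataset
    num_samples=128,       # placeholder sample count
    lora=True,             # new flag introduced by this change
)

model = OVModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # placeholder model id
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("llama-int4-lora-ov")  # placeholder output path

The CLI path covered by the new test in test_exporters_cli.py is along the lines of `optimum-cli export openvino --model <model_id> --weight-format int4 --lora --dataset wikitext2 <output_dir>`, with ratio, group size, and sample count added as needed.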