From 1c99a65792c460f224c58923c122fa9442ffd54f Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 10:56:11 +0100 Subject: [PATCH 1/4] Add a note about data-aware mixed precision assignment --- docs/source/openvino/export.mdx | 3 ++- optimum/commands/export/openvino.py | 3 ++- optimum/intel/openvino/configuration.py | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index 4876885219..e45c6a549b 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -78,7 +78,8 @@ Optional arguments: --ratio RATIO A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size - and inference latency. Default value is 1.0. + and inference latency. Default value is 1.0. Note: If dataset is provided, and the ration is + less than 1.0, then data-aware mixed precision assignment will be applied. --sym Whether to apply symmetric quantization --group-size GROUP_SIZE The group size to use for quantization. Recommended value is 128 and -1 uses per-column diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 61c21c5c72..1355e47be2 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -102,7 +102,8 @@ def parse_args_openvino(parser: "ArgumentParser"): default=None, help=( "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 " - "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0." + "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. " + "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied." ), ) optional_group.add_argument( diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index a0fc68361c..0123e2e356 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -344,6 +344,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM and the rest to INT8_ASYM). + Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment + will be applied. all_layers (`bool`, *optional*): Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision. sensitivity_metric (`str`, *optional*): From 4028332c88b4e296ef29132f7ed9b1bbd56ba7ba Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 11:03:20 +0100 Subject: [PATCH 2/4] Add a note to dataset parameter --- docs/source/openvino/export.mdx | 4 +++- optimum/commands/export/openvino.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index e45c6a549b..2b4ad4f05d 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -95,7 +95,9 @@ Optional arguments: can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will be collected from model's generations. For diffusion models it should be on of ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For - visual language models the dataset must be set to 'contextual'. + visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware + compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset + argument will not have an effect on the resulting model. --all-layers Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight compression is applied, they are compressed to INT8. --awq Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 1355e47be2..6965efcb54 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -141,7 +141,9 @@ def parse_args_openvino(parser: "ArgumentParser"): "dataset will be collected from model's generations. " "For diffusion models it should be on of ['conceptual_captions'," "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. " - "For visual language models the dataset must be set to 'contextual'." + "For visual language models the dataset must be set to 'contextual'. " + "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or " + "equals 1.0, the dataset argument will not have an effect on the resulting model." ), ) optional_group.add_argument( From 40ee6cde7f34ecafe31d99d2ef68102f3f194f7d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 18:26:00 +0100 Subject: [PATCH 3/4] Update docs/source/openvino/export.mdx Co-authored-by: Helena Kloosterman --- docs/source/openvino/export.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index 2b4ad4f05d..3e7e458c02 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -78,7 +78,7 @@ Optional arguments: --ratio RATIO A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size - and inference latency. Default value is 1.0. Note: If dataset is provided, and the ration is + and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied. --sym Whether to apply symmetric quantization --group-size GROUP_SIZE From 6c79118e1fa4f85077d34c8ec80f6a4cbd43a084 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 17 Dec 2024 10:18:11 +0100 Subject: [PATCH 4/4] Add a warning --- optimum/intel/openvino/configuration.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 0123e2e356..4fdfe368a2 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -443,7 +443,7 @@ def post_init(self): Safety checker that arguments are correct """ super().post_init() - if self.ratio is not None and not (0 <= self.ratio <= 1): + if not (0 <= self.ratio <= 1): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: raise ValueError("`group_size` must be greater than 0 or equal to -1") @@ -463,6 +463,18 @@ def post_init(self): or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}""" ) + if self.dataset is not None and not ( + self.quant_method == OVQuantizationMethod.AWQ + or self.scale_estimation + or self.gptq + or self.lora_correction + or (self.ratio < 1.0 and self.sensitivity_metric != nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR) + ): + logger.warning( + "The provided dataset won't have any effect on the resulting compressed model because no data-aware " + "quantization algorithm is selected and compression ratio is 1.0." + ) + if self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")