Add a note about data-aware mixed precision assignment (#1075)
* Add a note about data-aware mixed precision assignment

* Add a note to dataset parameter

* Update docs/source/openvino/export.mdx

Co-authored-by: Helena Kloosterman <[email protected]>

* Add a warning

---------

Co-authored-by: Helena Kloosterman <[email protected]>
nikita-savelyevv and helena-intel authored Dec 17, 2024
1 parent 7601bfd commit f030583
Showing 3 changed files with 25 additions and 5 deletions.
7 changes: 5 additions & 2 deletions docs/source/openvino/export.mdx
@@ -78,7 +78,8 @@ Optional arguments:
 --ratio RATIO A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit
 quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be
 quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size
-and inference latency. Default value is 1.0.
+and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is
+less than 1.0, then data-aware mixed precision assignment will be applied.
 --sym Whether to apply symmetric quantization
 --group-size GROUP_SIZE
 The group size to use for quantization. Recommended value is 128 and -1 uses per-column
@@ -94,7 +95,9 @@ Optional arguments:
 can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
 be collected from model's generations. For diffusion models it should be one of
 ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
-visual language models the dataset must be set to 'contextual'.
+visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware
+compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset
+argument will not have an effect on the resulting model.
 --all-layers Whether embeddings and last MatMul layers should be compressed to INT4. If not provided and
 weight compression is applied, they are compressed to INT8.
 --awq Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but
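For reference, the behavior described by the new note is reachable from the Python API as well as the `optimum-cli export openvino` flags documented above. A minimal sketch, assuming `optimum-intel` with its OpenVINO extras is installed (the model id is only an example, substitute your own):

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Equivalent of `--weight-format int4 --ratio 0.8 --dataset wikitext2` on the CLI.
# Since a dataset is given and ratio < 1.0, the INT4/INT8 split is decided with
# data-aware mixed precision assignment rather than from the weights alone.
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    ratio=0.8,            # 80% of layers go to int4, the rest stay int8
    dataset="wikitext2",  # calibration data driving the precision assignment
)

model = OVModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M",  # example model id
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("smollm2_int4_ov")
```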
7 changes: 5 additions & 2 deletions optimum/commands/export/openvino.py
@@ -102,7 +102,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
+            "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
         ),
     )
     optional_group.add_argument(
@@ -140,7 +141,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "dataset will be collected from model's generations. "
             "For diffusion models it should be one of ['conceptual_captions',"
             "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
-            "For visual language models the dataset must be set to 'contextual'."
+            "For visual language models the dataset must be set to 'contextual'. "
+            "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
+            "equals 1.0, the dataset argument will not have an effect on the resulting model."
         ),
     )
     optional_group.add_argument(
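To make the new notes concrete: mixed precision assignment ranks layers by a sensitivity score and keeps the most sensitive fraction in the backup precision. The sketch below is purely illustrative, not NNCF's actual implementation; it assumes per-layer scores have already been computed, either data-free from the weights or data-aware from calibration activations:

```python
def assign_precisions(layer_scores: dict, ratio: float) -> dict:
    """Illustrative only: split layers into int4/int8 given sensitivity scores.

    layer_scores maps layer name -> sensitivity (higher = hurt more by int4).
    With ratio=0.8, the 20% most sensitive layers are kept in int8.
    """
    n_int8 = round(len(layer_scores) * (1.0 - ratio))
    ranked = sorted(layer_scores, key=layer_scores.get, reverse=True)  # most sensitive first
    return {name: ("int8" if i < n_int8 else "int4") for i, name in enumerate(ranked)}


# Data-free scores use weight quantization error alone; data-aware scores fold in
# activation statistics from the calibration dataset, which is why the dataset only
# matters here when ratio < 1.0 (at ratio 1.0 every layer is int4 anyway).
scores = {"layer.0": 0.9, "layer.1": 0.2, "layer.2": 0.5, "layer.3": 0.1, "layer.4": 0.7}
print(assign_precisions(scores, ratio=0.8))
# {'layer.0': 'int8', 'layer.4': 'int4', 'layer.2': 'int4', 'layer.1': 'int4', 'layer.3': 'int4'}
```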
16 changes: 15 additions & 1 deletion optimum/intel/openvino/configuration.py
@@ -344,6 +344,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         ratio (`float`, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
+            Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment
+            will be applied.
         all_layers (`bool`, *optional*):
             Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
         sensitivity_metric (`str`, *optional*):
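The `sensitivity_metric` parameter whose entry closes the hunk above is also what the new warning later in this diff inspects: when a dataset is present, mixed precision assignment is data-aware unless the metric is pinned to the data-free weight quantization error. A hedged sketch, assuming the NNCF metric names ('weight_quantization_error', 'hessian_input_activation', and similar) are what this config accepts:

```python
from optimum.intel import OVWeightQuantizationConfig

# Data-aware: ratio < 1.0 plus a dataset. Leaving sensitivity_metric unset lets a
# data-aware metric be chosen; naming an activation-based one makes it explicit.
data_aware = OVWeightQuantizationConfig(
    bits=4,
    ratio=0.8,
    dataset="wikitext2",
    sensitivity_metric="hessian_input_activation",
)

# Data-free: pinning the metric to weight quantization error means the int4/int8
# split is computed from the weights alone, without consulting any dataset.
data_free = OVWeightQuantizationConfig(
    bits=4,
    ratio=0.8,
    sensitivity_metric="weight_quantization_error",
)
```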
@@ -441,7 +443,7 @@ def post_init(self):
         Safety checker that arguments are correct
         """
         super().post_init()
-        if self.ratio is not None and not (0 <= self.ratio <= 1):
+        if not (0 <= self.ratio <= 1):
             raise ValueError("`ratio` must be between 0 and 1.")
         if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
             raise ValueError("`group_size` must be greater than 0 or equal to -1")
@@ -461,6 +463,18 @@ def post_init(self):
                     or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
                 )
 
+        if self.dataset is not None and not (
+            self.quant_method == OVQuantizationMethod.AWQ
+            or self.scale_estimation
+            or self.gptq
+            or self.lora_correction
+            or (self.ratio < 1.0 and self.sensitivity_metric != nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR)
+        ):
+            logger.warning(
+                "The provided dataset won't have any effect on the resulting compressed model because no data-aware "
+                "quantization algorithm is selected and compression ratio is 1.0."
+            )
+
         if self.bits not in [4, 8]:
             raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")
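A quick sketch of when the new warning fires, assuming the public config class from this file (constructing the config runs `post_init`):

```python
from optimum.intel import OVWeightQuantizationConfig

# Warns: a dataset is supplied, but ratio defaults to 1.0 and no data-aware
# algorithm (AWQ, scale estimation, GPTQ, LoRA correction) is enabled, so the
# calibration data cannot influence the compressed model.
OVWeightQuantizationConfig(bits=4, dataset="wikitext2")

# Silent: ratio < 1.0 makes the dataset drive mixed precision assignment.
OVWeightQuantizationConfig(bits=4, ratio=0.8, dataset="wikitext2")

# Silent: scale estimation is data-aware, so the dataset is used regardless of ratio.
OVWeightQuantizationConfig(bits=4, dataset="wikitext2", scale_estimation=True)
```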
