From 9e438b505997ab0fc89385ac84ea5595a34e669e Mon Sep 17 00:00:00 2001
From: zhenwei-intel
Date: Thu, 4 Jul 2024 10:09:30 +0800
Subject: [PATCH] update qconfig for xpu

Signed-off-by: zhenwei-intel
---
 .../transformers/modeling/modeling_auto.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index c73edca48f8..1314e464eff 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -1872,7 +1872,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         # weight dtype is higher priority than bits in config.json when both existed.
         if quantization_config.weight_dtype is None:
             if quantization_config.bits == 4:
-                quantization_config.weight_dtype = "int4_clip"
+                if use_xpu:
+                    quantization_config.weight_dtype = "int4_fullrange"
+                else:
+                    quantization_config.weight_dtype = "int4_clip"
                 logger.info(
                     "{} quantization weight_dtype is used due to bits is 4 in config.json.".format(
                         quantization_config.weight_dtype)
@@ -1918,7 +1921,6 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 "fp4_e2m1",
                 "fp4_e2m1_bnb",
                 "nf4",
-                "int4_fullrange",
             ]:
                 model = build_woq_model(model, quantization_config)
             else:
@@ -2026,7 +2028,6 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]):
                 "nf4",
                 "fp4_e2m1",
                 "fp4_e2m1_bnb",
-                "int4_fullrange",
             ] and not quantization_config.use_ipex:
                 model = replace_linear(
                     model,
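
Net effect of the three hunks: when no weight_dtype is given in config.json and bits is 4, an XPU load now defaults to "int4_fullrange" instead of "int4_clip", and "int4_fullrange" is dropped from the two CPU-oriented dtype lists that route through build_woq_model and replace_linear. Below is a minimal standalone sketch of the resulting default selection, under the assumption that use_xpu reflects the target device; resolve_weight_dtype is a hypothetical helper for illustration, not the actual ITREX API (the real logic is inline in load_low_bit in modeling_auto.py):

    # Sketch of the 4-bit default-dtype selection after this patch.
    def resolve_weight_dtype(weight_dtype, bits, use_xpu):
        # An explicit weight_dtype in config.json takes priority over bits.
        if weight_dtype is not None:
            return weight_dtype
        if bits == 4:
            # The change: XPU defaults to full-range int4,
            # CPU keeps the clipped int4 variant.
            return "int4_fullrange" if use_xpu else "int4_clip"
        return weight_dtype

    assert resolve_weight_dtype(None, 4, use_xpu=True) == "int4_fullrange"
    assert resolve_weight_dtype(None, 4, use_xpu=False) == "int4_clip"
    assert resolve_weight_dtype("nf4", 4, use_xpu=True) == "nf4"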