From 59edfe373caa60577754623f30538c931bf1c7a4 Mon Sep 17 00:00:00 2001
From: "Dong, Bo"
Date: Wed, 17 Jul 2024 13:33:09 +0800
Subject: [PATCH] Support lmhead int4

Signed-off-by: Dong, Bo
---
 .../transformers/llm/quantization/utils.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
index 0678c2eb72e..c8d508d06fa 100644
--- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py
+++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
@@ -136,7 +136,8 @@ def replace_linear(
     if modules_to_not_convert is None:
         # output_layer is chatglm last layer name
         # embed_out is dolly_v2 last layer name
-        modules_to_not_convert = ["lm_head", "output_layer", "embed_out"]
+        #modules_to_not_convert = ["lm_head", "output_layer", "embed_out"]
+        modules_to_not_convert = []
     if quantization_config.llm_int8_skip_modules:
         modules_to_not_convert = modules_to_not_convert.extend(
             quantization_config.llm_int8_skip_modules
@@ -662,10 +663,10 @@ def convert_to_quantized_model(model, config, device="cpu"):
             iters=config.iters,
             scale_dtype=config.scale_dtype,
         )
-        if config.quant_lm_head is False:
-            quant_config.set_local(".*lm_head", AutoRoundConfig(dtype="fp32"))
-            quant_config.set_local(".*output_layer", AutoRoundConfig(dtype="fp32"))
-            quant_config.set_local(".*embed_out", AutoRoundConfig(dtype="fp32"))
+        #if config.quant_lm_head is False:
+        #    quant_config.set_local(".*lm_head", AutoRoundConfig(dtype="fp32"))
+        #    quant_config.set_local(".*output_layer", AutoRoundConfig(dtype="fp32"))
+        #    quant_config.set_local(".*embed_out", AutoRoundConfig(dtype="fp32"))
         logger.info(f"Do AutoRound algorithm with config {quant_config}")
         dataloader = get_autoround_dataloader(tokenizer=config.tokenizer,
                                               seqlen=config.seq_len,
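
Illustration only, not part of the patch: the first hunk empties the default modules_to_not_convert skip list, so replace_linear() will also convert the language-model head to an int4 linear, and the second hunk drops the AutoRound override that forced those layers back to fp32. The minimal sketch below mimics that skip-list filtering on a toy torch module to show the effect of an empty list; the helper modules_to_convert() and the toy layer names are hypothetical and are not the library's API.

from torch import nn


def modules_to_convert(model, modules_to_not_convert):
    """Return names of the nn.Linear modules that would be quantized,
    i.e. those whose names match nothing in the skip list."""
    return [
        name
        for name, module in model.named_modules()
        if isinstance(module, nn.Linear)
        and not any(skip in name for skip in modules_to_not_convert)
    ]


# Toy model with a projection layer and a language-model head.
toy = nn.ModuleDict({"q_proj": nn.Linear(8, 8), "lm_head": nn.Linear(8, 16)})

# Previous default skip list: the lm_head is left unquantized.
print(modules_to_convert(toy, ["lm_head", "output_layer", "embed_out"]))  # ['q_proj']

# With the patch the default skip list is empty, so the lm_head is
# also eligible for int4 weight-only quantization.
print(modules_to_convert(toy, []))  # ['q_proj', 'lm_head']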