From 24b46b2b19b770ea77cc285b9558663f82dc7635 Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Tue, 26 Nov 2024 00:39:39 -0800
Subject: [PATCH] [NPU] further fix of qwen2 int8 pipeline & C++ (#12449)

* fix

* fix style
---
 python/llm/src/ipex_llm/transformers/npu_model.py          | 3 ++-
 .../transformers/npu_pipeline_model/convert_pipeline.py    | 7 ++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index e57cd0ba760..eb684bce715 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -231,7 +231,7 @@ def optimize_npu_model(cls, *args, **kwargs):
         from intel_npu_acceleration_library.compiler import create_npu_kernels
 
         model = kwargs.pop("model")
-        qtype = kwargs.pop("qtype", "sym_int4")
+        qtype = kwargs.pop("qtype", "sym_int4_rtn")
         mixed_precision = kwargs.pop("mixed_precision", False)
         quantization_group_size = kwargs.pop("quantization_group_size", 0)
         modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
@@ -280,6 +280,7 @@ def optimize_npu_model(cls, *args, **kwargs):
                             max_prompt_len=max_prompt_len,
                             transpose_value_cache=transpose_value_cache,
                             group_size=quantization_group_size,
+                            qtype=qtype,
                             convert_model=convert_model,
                             save_directory=save_directory)
             model.save_low_bit = types.MethodType(save_low_bit, model)
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
index 84b02363452..50448bd684b 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -193,13 +193,18 @@ def convert_llm(model: torch.nn.Module,
                 max_prompt_len: int,
                 transpose_value_cache: bool,
                 group_size: int,
+                qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None):
    # whether to set layernorm weight as const
    layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
    if group_size == 0:
        n_splits_linear = 1
-        n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
+        if qtype == "sym_int8_rtn":
+            # do not split mlp down_proj for Qwen2-7B & sym_int8
+            n_splits_down_proj = 1
+        else:
+            n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
    else:
        n_splits_linear = model.config.hidden_size // group_size
        n_splits_down_proj = model.config.intermediate_size // group_size
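
The patch threads `qtype` from `optimize_npu_model` into `convert_llm` so the down_proj split count can depend on the quantization type, not just the model shape. Below is a minimal sketch of that selection logic pulled out into a standalone function for illustration; the helper name `select_down_proj_splits` is hypothetical, and the real logic lives inline in `convert_llm()` in `convert_pipeline.py`.

```python
def select_down_proj_splits(intermediate_size: int, group_size: int, qtype: str) -> int:
    """Return how many chunks the MLP down_proj weight is split into (sketch)."""
    if group_size == 0:
        if qtype == "sym_int8_rtn":
            # sym_int8 path: keep down_proj as a single chunk, even for Qwen2-7B
            return 1
        # int4 path: split Qwen2-7B's large down_proj (intermediate_size == 18944) in two
        return 2 if intermediate_size == 18944 else 1
    # grouped quantization: one chunk per quantization group
    return intermediate_size // group_size


# Example with Qwen2-7B's intermediate_size and per-channel quantization (group_size=0):
assert select_down_proj_splits(18944, 0, "sym_int8_rtn") == 1
assert select_down_proj_splits(18944, 0, "sym_int4_rtn") == 2
```

Before this change, the split decision looked only at `intermediate_size`, so the int8 pipeline for Qwen2-7B inherited the two-way split intended for the int4 case.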