diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
index 5b40c27984b..a0ad86f17cd 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -197,7 +197,7 @@ def convert_llm(model: torch.nn.Module,
                 convert_model: bool=False,
                 save_directory: str=None):
     # whether to set layernorm weight as const
-    layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
+    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
         n_splits_linear = 1
         if qtype == "sym_int8_rtn":
@@ -344,7 +344,7 @@ def convert_llm(model: torch.nn.Module,
                 invalidInputError(False,
                                   "False to InitLLMPipeline.")
     elif model.config.model_type == "qwen2":
-        layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "0") == "1"
+        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
         with tempfile.TemporaryDirectory() as temp_dir:
             if save_directory is not None:
                 temp_dir = save_directory
@@ -426,9 +426,11 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
     os.mkdir(weight_dir)
+    use_level_zero = os.environ.get("IPEX_LLM_NPU_USE_LEVEL0", "0") == "1"
+    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
 
     if model.config.model_type == "qwen2":
-        layernorm_const = True
+        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
         if model.config.hidden_size == 1536:
             # Qwen2-1.5B-Instruct
             fused_layers = 1
@@ -447,16 +449,28 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "weight_num": 7,
                        "weight_idx": 8,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
 
         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
-        # save fused_layers blobs of fused decoder layers
-        convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
-                                 save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode")
+        if not use_level_zero:
+            # save fused_layers blobs of fused decoder layers
+            convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
+                                     save_directory, weight_dir, transpose_value_cache, kv_len,
+                                     group_size, layernorm_const, "decode")
+        else:
+            # save layer_num blobs of each decoder layer
+            layer_num = len(model.model.layers)
+            param_list = []
+            for layer_idx in range(0, layer_num):
+                param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
+                                   save_directory, weight_dir, transpose_value_cache, kv_len,
+                                   group_size, layernorm_const))
+            with Pool() as pool:
+                result = pool.starmap(convert_qwen_layer, param_list)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
@@ -466,7 +480,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                                       save_directory, weight_dir,
                                       convert_model=True)
     elif model.config.model_type == "llama":
-        layernorm_const = True
         embedding_post = False
         cos_sin_input = False
         use_prefill_sdp = False
@@ -499,7 +512,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "embedding_post": embedding_post,
                        "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
 
@@ -519,7 +533,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                             group_size, layernorm_const, "prefill")
     elif model.config.model_type == "minicpm":
-        layernorm_const = True
         fused_layers = 4
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
@@ -536,7 +549,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "model_type": "minicpm",
                        "embedding_post": True,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
 
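
The decode-path change above switches between the existing fused-blob export and a per-layer export fanned out through multiprocessing.Pool.starmap, with both switches read from environment variables (IPEX_LLM_NPU_USE_LEVEL0 and IPEX_LLM_NPU_LAYERNORM_CONST). The sketch below reproduces only that gating pattern in isolation: export_layer, export_fused_layers and export_decode_blobs are hypothetical stand-ins rather than the repository's API, and the real convert_qwen_layer / convert_fused_qwen_layer calls also pass the model plus split, weight-dir, KV-length, cache and group-size arguments as shown in the diff.

import os
from multiprocessing import Pool


# Hypothetical stand-ins for convert_qwen_layer / convert_fused_qwen_layer;
# the real converters take many more arguments (see the diff above).
def export_layer(layer_idx, layernorm_const):
    return f"decoder_layer_{layer_idx}(layernorm_const={layernorm_const})"


def export_fused_layers(fused_layers, layernorm_const):
    return [f"fused_group_{i}(layernorm_const={layernorm_const})"
            for i in range(fused_layers)]


def export_decode_blobs(layer_num=28, fused_layers=2):
    # Same switches and defaults as the qwen2 branch of the diff:
    # Level Zero path off by default, layernorm-const off by default.
    use_level_zero = os.environ.get("IPEX_LLM_NPU_USE_LEVEL0", "0") == "1"
    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"

    if not use_level_zero:
        # Existing path: one blob per group of fused decoder layers.
        return export_fused_layers(fused_layers, layernorm_const)
    # New path: one blob per decoder layer, converted in parallel.
    param_list = [(idx, layernorm_const) for idx in range(layer_num)]
    with Pool() as pool:
        return pool.starmap(export_layer, param_list)


if __name__ == "__main__":
    os.environ["IPEX_LLM_NPU_USE_LEVEL0"] = "1"
    print(len(export_decode_blobs()))  # 28 per-layer blobs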