[NPU] Add L0 support for NPU C++ #12454

Merged 3 commits on Nov 27, 2024 · Changes from all commits
@@ -197,7 +197,7 @@ def convert_llm(model: torch.nn.Module,
                 convert_model: bool=False,
                 save_directory: str=None):
     # whether to set layernorm weight as const
-    layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
+    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
         n_splits_linear = 1
         if qtype == "sym_int8_rtn":
@@ -344,7 +344,7 @@ def convert_llm(model: torch.nn.Module,
             invalidInputError(False,
                               "False to InitLLMPipeline.")
     elif model.config.model_type == "qwen2":
-        layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "0") == "1"
+        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
         with tempfile.TemporaryDirectory() as temp_dir:
             if save_directory is not None:
                 temp_dir = save_directory
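Note on the rename above: both call sites in convert_llm now read the NPU-scoped variable IPEX_LLM_NPU_LAYERNORM_CONST instead of IPEX_LLM_LAYERNORM_CONST. A minimal usage sketch, assuming a driver script that sets the variable before running the conversion (the surrounding script and comments on value semantics are illustrative, not part of this diff):

```python
import os

# Env var name comes from this PR; per the code above, only the literal "1"
# enables the layernorm-as-constant behavior.
os.environ["IPEX_LLM_NPU_LAYERNORM_CONST"] = "1"

# ... then load the model and invoke the NPU pipeline conversion as usual;
# convert_llm() reads the variable when it runs.
```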
@@ -426,9 +426,11 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
     os.mkdir(weight_dir)
+    use_level_zero = os.environ.get("IPEX_LLM_NPU_USE_LEVEL0", "0") == "1"
+    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
 
     if model.config.model_type == "qwen2":
-        layernorm_const = True
+        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
         if model.config.hidden_size == 1536:
             # Qwen2-1.5B-Instruct
             fused_layers = 1
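The new IPEX_LLM_NPU_USE_LEVEL0 switch defaults to off, so existing conversions keep the fused-decoder export; also note the comparison is strictly against the string "1", so values such as "true" or "ON" do not enable the Level Zero path. A standalone sketch of that parsing rule (illustrative only, not library API):

```python
import os

def flag_enabled(name):
    # Same parsing rule as the diff: only the literal string "1" counts as on.
    return os.environ.get(name, "0") == "1"

os.environ["IPEX_LLM_NPU_USE_LEVEL0"] = "true"
print(flag_enabled("IPEX_LLM_NPU_USE_LEVEL0"))  # False -- not the string "1"

os.environ["IPEX_LLM_NPU_USE_LEVEL0"] = "1"
print(flag_enabled("IPEX_LLM_NPU_USE_LEVEL0"))  # True -- Level Zero path enabled
```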
@@ -447,16 +449,28 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "weight_num": 7,
                        "weight_idx": 8,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
 
         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
-        # save fused_layers blobs of fused decoder layers
-        convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
-                                 save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode")
+        if not use_level_zero:
+            # save fused_layers blobs of fused decoder layers
+            convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
+                                     save_directory, weight_dir, transpose_value_cache, kv_len,
+                                     group_size, layernorm_const, "decode")
+        else:
+            # save layer_num blobs of each decoder layer
+            layer_num = len(model.model.layers)
+            param_list = []
+            for layer_idx in range(0, layer_num):
+                param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
+                                   save_directory, weight_dir, transpose_value_cache, kv_len,
+                                   group_size, layernorm_const))
+            with Pool() as pool:
+                result = pool.starmap(convert_qwen_layer, param_list)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
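In the Level Zero branch above, every decoder layer is exported as its own blob, fanned out over a process pool: Pool.starmap unpacks each tuple in param_list into the positional arguments of convert_qwen_layer (Pool is assumed here to be multiprocessing.Pool, imported elsewhere in this module). A standalone sketch of the same fan-out pattern with a stand-in worker, not the real conversion API:

```python
from multiprocessing import Pool

def convert_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                  save_directory, weight_dir, transpose_value_cache, kv_len,
                  group_size, layernorm_const):
    # Stand-in for convert_qwen_layer: the real function compiles one decoder
    # layer into its own NPU blob under save_directory.
    return f"decoder_layer_{layer_idx}.blob"

if __name__ == "__main__":
    layer_num = 4  # stands in for len(model.model.layers)
    param_list = [(None, idx, 2, 2, "out_dir", "out_dir/model_weights",
                   True, 1024, 0, True)
                  for idx in range(layer_num)]
    with Pool() as pool:
        # starmap(f, [(a, b, ...), ...]) == [f(a, b, ...), ...], run in parallel
        blobs = pool.starmap(convert_layer, param_list)
    print(blobs)  # one blob name per decoder layer
```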
@@ -466,7 +480,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                                       save_directory, weight_dir,
                                       convert_model=True)
     elif model.config.model_type == "llama":
-        layernorm_const = True
         embedding_post = False
         cos_sin_input = False
         use_prefill_sdp = False
@@ -499,7 +512,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "embedding_post": embedding_post,
                        "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
 
@@ -519,7 +533,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                             group_size, layernorm_const, "prefill")
     elif model.config.model_type == "minicpm":
-        layernorm_const = True
         fused_layers = 4
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
@@ -536,7 +549,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "model_type": "minicpm",
                        "embedding_post": True,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
 
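Because every update_dict now carries "use_level_zero" and is merged into model.config before save_pretrained, the flag lands in config.json under save_directory, where the C++/NPU runtime can pick the matching blob layout. A minimal sketch of a consumer reading it back, assuming the standard save_pretrained file layout; the path and branching are illustrative, not the pipeline's actual loader:

```python
import json
import os

save_directory = "npu-converted-model"  # placeholder path

with open(os.path.join(save_directory, "config.json")) as f:
    config = json.load(f)

# "use_level_zero" is written by convert_llm_for_deploy via
# model.config.update(update_dict); "model_type" is a standard config field.
if config.get("use_level_zero", False):
    print(f"{config['model_type']}: expect one blob per decoder layer (L0 path)")
else:
    print(f"{config['model_type']}: expect fused decoder-layer blobs")
```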