From 8358d87ae7c83454f80bb5de9584eb83404c50ad Mon Sep 17 00:00:00 2001
From: Dudi Lester
Date: Mon, 6 Jan 2025 14:43:19 +0200
Subject: [PATCH] [SW-199650] Add HPU fp8 DynamicMOE Op

---
 requirements-hpu.txt                          |  2 +-
 vllm/model_executor/layers/fused_moe/layer.py | 23 +++++--------------
 2 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index ab4b823784bdc..873d2db93f90d 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@01090a8
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@87ab1b8
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 634e57dafa4de..3b2354b394d9d 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -178,9 +178,7 @@ def forward_hpu(
                                           'not supported on HPU')
         assert topk_group is None, 'topk_group is not supported on HPU'
         if layer is not None:
-            return layer.hpu_static_fused_moe(x, layer.w13_weight,
-                                              layer.w2_weight, router_logits,
-                                              top_k)
+            return layer.hpu_fused_moe(x, router_logits, top_k)
 
     def forward_cpu(
         self,
@@ -300,15 +298,11 @@ def __init__(
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         if is_hpu:
-            from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE
+            from vllm_hpu_extension.ops import DynamicFusedMOE
+            self.hpu_fused_moe = DynamicFusedMOE(self.num_experts)
 
-            from vllm.model_executor.layers.quantization.inc import INCConfig
-            selected_fused_moe = (StaticFusedMOE if isinstance(
-                quant_config, INCConfig) else DynamicFusedMOE)
-            self.hpu_static_fused_moe = selected_fused_moe(self.num_experts)
         self.scoring_func = scoring_func
         self.e_score_correction_bias = e_score_correction_bias
-
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
@@ -404,10 +398,8 @@ def _load_w13(self,
         expert_data.copy_(loaded_weight)
         if is_hpu:
-            from vllm_hpu_extension.ops import StaticFusedMOE
-            if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
-                self.hpu_static_fused_moe.w13_list[expert_id].set_weight(
-                    orig_exp_data)
+            self.hpu_fused_moe.MoeOp.w13_list[expert_id].set_weight(
+                orig_exp_data)
 
     def _load_w2(self,
                  expert_data: torch.Tensor,
@@ -426,10 +418,7 @@ def _load_w2(self,
         # w2, down_proj: Load into only logical weight of w2.
         expert_data.copy_(loaded_weight)
         if is_hpu:
-            from vllm_hpu_extension.ops import StaticFusedMOE
-            if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
-                self.hpu_static_fused_moe.w2_list[expert_id].set_weight(
-                    expert_data)
+            self.hpu_fused_moe.MoeOp.w2_list[expert_id].set_weight(expert_data)
 
     def _load_single_value(self, param: torch.nn.Parameter,
                            loaded_weight: torch.Tensor, expert_id: int):