From 4e911b4f358767b3c57ea7d182dd8393f6e7d3e7 Mon Sep 17 00:00:00 2001
From: Huanxing <41462385+ccrhx4@users.noreply.github.com>
Date: Wed, 13 Nov 2024 15:52:16 +0800
Subject: [PATCH] do not use softmax fast mode in FusedSDPA (#26)

---
 vllm_hpu_extension/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_hpu_extension/ops.py b/vllm_hpu_extension/ops.py
index 97ef7923..c024af6b 100644
--- a/vllm_hpu_extension/ops.py
+++ b/vllm_hpu_extension/ops.py
@@ -223,7 +223,7 @@ def prompt_attention(
         if query_heads != kv_heads:
             key = repeat_kv(key, int(query_heads // kv_heads))
             value = repeat_kv(value, int(query_heads // kv_heads))
-        softmax_mode = 'fast'
+        softmax_mode = 'None'
         recompute_mode = True
         attn_weights = FusedSDPA.apply(query, key, value, None, 0.0, True,
                                        scale, softmax_mode, recompute_mode,