From c52c37efe6017b079f7d4c2eca5e6c2697067d8f Mon Sep 17 00:00:00 2001
From: Sanju C Sudhakaran
Date: Thu, 2 Jan 2025 08:10:38 +0200
Subject: [PATCH] Handle cos-sin cache in every forward in case of long-context + LoRA

---
 vllm/model_executor/layers/rotary_embedding.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 86ede94abc5e9..a601189788441 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -232,7 +232,10 @@ def forward_hpu(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         from habana_frameworks.torch.hpex.kernels import (
             RotaryPosEmbeddingMode, apply_rotary_pos_emb)
-        if not hasattr(self, "sin") or self.sin is None or offsets is not None:
+
+        # Prepare cos-sin caches for long-context + LoRA with offsets for every
+        # forward, since the offset information wasn't available previously
+        if hasattr(self, "scaling_factors") or self.sin is None:
             self.prepare_cos_sin(positions, offsets)
         num_tokens = positions.shape[0] * positions.shape[1]
         # HPU RoPE kernel requires hidden dimension for cos and sin to be equal
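
A minimal sketch of the caching behaviour this change affects, for reference.
The attribute names (sin, cos, scaling_factors, prepare_cos_sin, offsets)
mirror the patch; the class itself, the tensor shapes, and the dummy angle
computation are illustrative assumptions, not vLLM's actual RotaryEmbedding
implementation.

import torch


class RotaryEmbeddingSketch:
    """Illustrative stand-in for the layer touched by the patch."""

    def __init__(self, scaling_factors=None):
        # `scaling_factors` is only set on long-context (scaled RoPE) variants,
        # which is what the new `hasattr(self, "scaling_factors")` check keys on.
        if scaling_factors is not None:
            self.scaling_factors = scaling_factors
        self.sin = None
        self.cos = None

    def prepare_cos_sin(self, positions, offsets=None):
        # Offsets shift positions per request (the long-context + LoRA case),
        # so a cache built without them cannot simply be reused.
        if offsets is not None:
            positions = positions + offsets
        angles = positions.float().unsqueeze(-1)  # placeholder angle computation
        self.sin = torch.sin(angles)
        self.cos = torch.cos(angles)

    def forward_hpu(self, positions, offsets=None):
        # Old condition: rebuild only when the cache was missing or offsets
        # were passed. New condition (as in the patch): rebuild on every
        # forward whenever scaling_factors exists, because the offset
        # information is not available ahead of time for long-context + LoRA.
        if hasattr(self, "scaling_factors") or self.sin is None:
            self.prepare_cos_sin(positions, offsets)
        return self.cos, self.sin


# Usage: with scaling_factors set, the cache is rebuilt on every call,
# so per-request offsets are always reflected in cos/sin.
rope = RotaryEmbeddingSketch(scaling_factors=[1.0, 4.0])
positions = torch.arange(8).unsqueeze(0)  # [batch, seq_len]
cos1, sin1 = rope.forward_hpu(positions)
cos2, sin2 = rope.forward_hpu(positions, offsets=torch.full_like(positions, 16))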